Example #1
def tacotron_synthesize(args, hparams, checkpoint, sentences=None):
    output_dir = args.output_dir

    try:
        checkpoint_path = tf.train.get_checkpoint_state(
            checkpoint).model_checkpoint_path
        log("loaded model at {}".format(checkpoint_path))
    except AttributeError:
        # get_checkpoint_state returns None when no checkpoint is found
        raise RuntimeError(
            "Failed to load checkpoint at {}".format(checkpoint))

    if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
        raise ValueError(
            "Defined synthesis batch size {} is smaller than minimum required {} "
            "(num_gpus)! Please verify your synthesis batch size choice.".
            format(hparams.tacotron_synthesis_batch_size,
                   hparams.tacotron_num_gpus))

    if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
        raise ValueError(
            "Defined synthesis batch size {} is not a multiple of {} (num_gpus)! "
            "Please verify your synthesis batch size choice!".format(
                hparams.tacotron_synthesis_batch_size,
                hparams.tacotron_num_gpus))

    if args.mode == "eval":
        return run_eval(args, checkpoint_path, output_dir, hparams, sentences)
    elif args.mode == "synthesis":
        return run_synthesis(args, checkpoint_path, output_dir, hparams)
    else:
        run_live(args, checkpoint_path, hparams)
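
A minimal usage sketch, assuming the repo's hparams object is in scope; the Namespace fields and the checkpoint path are hypothetical stand-ins for the attributes the function actually reads (args.output_dir and args.mode):

from argparse import Namespace

# Hypothetical arguments; tacotron_synthesize only reads output_dir and mode here.
args = Namespace(output_dir="tacotron_output", mode="eval")
sentences = ["Hello world.", "This is a synthesis test."]
# mode == "eval" dispatches to run_eval and returns the eval directory;
# "logs-tacotron/taco_pretrained" is a placeholder checkpoint directory.
eval_dir = tacotron_synthesize(args, hparams, "logs-tacotron/taco_pretrained",
                               sentences=sentences)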
Example #2
    def __init__(self, checkpoint_path, hparams, gta=False, model_name="Tacotron", seed=None):
        log("Constructing model: %s" % model_name)

        # Initialize tensorflow random number seed for deterministic operation if provided
        if seed is not None:
            tf.compat.v1.set_random_seed(seed)

        # Force the batch size to be known in order to use attention masking in batch synthesis
        inputs = tf.compat.v1.placeholder(tf.int32, (None, None), name="inputs")
        input_lengths = tf.compat.v1.placeholder(tf.int32, (None,), name="input_lengths")
        speaker_embeddings = tf.compat.v1.placeholder(tf.float32, (None, hparams.speaker_embedding_size),
                                                      name="speaker_embeddings")
        targets = tf.compat.v1.placeholder(tf.float32, (None, None, hparams.num_mels), name="mel_targets")
        split_infos = tf.compat.v1.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos")
        with tf.compat.v1.variable_scope("Tacotron_model") as scope:
            self.model = create_model(model_name, hparams)
            if gta:
                self.model.initialize(inputs, input_lengths, speaker_embeddings, targets, gta=gta,
                                      split_infos=split_infos)
            else:
                self.model.initialize(inputs, input_lengths, speaker_embeddings,
                                      split_infos=split_infos)
            
            self.mel_outputs = self.model.tower_mel_outputs
            self.linear_outputs = self.model.tower_linear_outputs if (hparams.predict_linear and not gta) else None
            self.alignments = self.model.tower_alignments
            self.stop_token_prediction = self.model.tower_stop_token_prediction
            self.targets = targets
        
        self.gta = gta
        self._hparams = hparams
        # Pad input sequences with the <pad_token> 0 ( _ )
        self._pad = 0
        # Explicitly set the padding to a value that doesn't originally exist in the spectrogram
        # to avoid any possible conflicts, without affecting the output range of the model too much
        if hparams.symmetric_mels:
            self._target_pad = -hparams.max_abs_value
        else:
            self._target_pad = 0.
        
        self.inputs = inputs
        self.input_lengths = input_lengths
        self.speaker_embeddings = speaker_embeddings
        self.targets = targets
        self.split_infos = split_infos
        
        log("Loading checkpoint: %s" % checkpoint_path)
        # Memory allocation on the GPUs as needed
        config = tf.compat.v1.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        
        self.session = tf.compat.v1.Session(config=config)
        self.session.run(tf.compat.v1.global_variables_initializer())
        
        saver = tf.compat.v1.train.Saver()
        saver.restore(self.session, checkpoint_path)
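
Hypothetical usage: the graph is built and the checkpoint restored once in __init__, so construct the class a single time and reuse it across calls (the checkpoint path below is a placeholder):

# Build the graph and restore weights once; reuse the session for every call.
synth = Tacotron2("taco_pretrained/tacotron_model.ckpt-200000", hparams,
                  gta=False, seed=42)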
Example #3
    def make_test_batches(self):
        start = time.time()

        # Read a group of examples
        n = self._hparams.tacotron_batch_size
        r = self._hparams.outputs_per_step

        # Test on the entire test set
        examples = [self._get_test_groups() for i in range(len(self._test_meta))]

        # Bucket examples based on similar output sequence length for efficiency
        examples.sort(key=lambda x: x[-1])
        batches = [examples[i:i + n] for i in range(0, len(examples), n)]
        np.random.shuffle(batches)

        log("\nGenerated %d test batches of size %d in %.3f sec" %
            (len(batches), n, time.time() - start))
        return batches, r
Example #4
    def _enqueue_next_train_group(self):
        while not self._coord.should_stop():
            start = time.time()

            # Read a group of examples
            n = self._hparams.tacotron_batch_size
            r = self._hparams.outputs_per_step
            examples = [self._get_next_example() for i in range(n * _batches_per_group)]

            # Bucket examples based on similar output sequence length for efficiency
            examples.sort(key=lambda x: x[-1])
            batches = [examples[i: i + n] for i in range(0, len(examples), n)]
            np.random.shuffle(batches)

            log("\nGenerated {} train batches of size {} in {:.3f} sec".format(len(batches), n, time.time() - start))
            for batch in batches:
                feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r)))
                self._session.run(self._enqueue_op, feed_dict=feed_dict)
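
The sort/slice/shuffle idiom above is the core of length bucketing: sorting on each example's last field (its output length) packs similarly sized sequences into the same batch so padding is minimal, while shuffling at the batch level keeps training order random. A self-contained sketch with synthetic data:

import numpy as np

n = 8  # batch size
# Synthetic (name, length) pairs standing in for feeder examples.
examples = [("ex{}".format(i), int(l))
            for i, l in enumerate(np.random.randint(50, 500, size=64))]
examples.sort(key=lambda x: x[-1])  # bucket by sequence length
batches = [examples[i:i + n] for i in range(0, len(examples), n)]
np.random.shuffle(batches)          # shuffle batches, not individual examples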
Example #5
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    eval_dir = os.path.join(output_dir, "eval")
    log_dir = os.path.join(output_dir, "logs-eval")
    
    # Create output path if it doesn't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, "wavs"), exist_ok=True)
    os.makedirs(os.path.join(log_dir, "plots"), exist_ok=True)
    
    log(hparams_debug_string())
    synth = Tacotron2(checkpoint_path, hparams)
    
    # Set inputs batch-wise
    sentences = [sentences[i: i+hparams.tacotron_synthesis_batch_size] for i 
                 in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)]
    
    log("Starting Synthesis")
    with open(os.path.join(eval_dir, "map.txt"), "w") as file:
        for i, texts in enumerate(tqdm(sentences)):
            start = time.time()
            basenames = ["batch_{}_sentence_{}".format(i, j) for j in range(len(texts))]
            mel_filenames, speaker_ids = synth.synthesize(texts, basenames, eval_dir, log_dir, None)
            
            for elems in zip(texts, mel_filenames, speaker_ids):
                file.write("|".join([str(x) for x in elems]) + "\n")
    log("synthesized mel spectrograms at {}".format(eval_dir))
    return eval_dir
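
Each line written to map.txt above is pipe-separated text|mel_filename|speaker_id, so downstream tools can recover the mapping with a plain split (the path below assumes the hypothetical output_dir from Example #1):

# Read the synthesis map back; fields mirror the zip() in run_eval above.
with open("tacotron_output/eval/map.txt", encoding="utf-8") as f:
    for line in f:
        text, mel_filename, speaker_id = line.strip().split("|")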
Example #6
def run_live(args, checkpoint_path, hparams):
    # Log to terminal without keeping any records in files
    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    # Generate fast greeting message
    greetings = "Hello, Welcome to the Live testing tool. Please type a message and I will try " \
                "to read it!"
    log(greetings)
    generate_fast(synth, greetings)

    # Interaction loop
    while True:
        try:
            text = input()
            generate_fast(synth, text)

        except KeyboardInterrupt:
            leave = "Thank you for testing our features. see you soon."
            log(leave)
            generate_fast(synth, leave)
            sleep(2)
            break
Example #7
    def initialize(self,
                   inputs,
                   input_lengths,
                   embed_targets,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False,
                   split_infos=None):
        """
        Initializes the model for inference sets "mel_outputs" and "alignments" fields.
        Args:
            - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
              steps in the input time series, and values are character IDs
            - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the 
            lengths of each sequence in inputs.
            - embed_targets: float32 Tensor with shape [N, E] where E is the speaker 
            embedding size.
            - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, 
            T_out is number of steps in the output time series, M is num_mels, and values are 
            entries in the mel spectrogram. Only needed for training.
        """
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                "no mel targets were provided but stop_token_targets were given")
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                "Mel targets are provided without corresponding stop_token_targets")
        if not gta and self._hparams.predict_linear and linear_targets is None and \
                is_training:
            raise ValueError(
                "Model is set to use post processing to predict linear spectrograms in training "
                "but no linear targets given!")
        if gta and linear_targets is not None:
            raise ValueError(
                "Linear spectrogram prediction is not supported in GTA mode!")
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                "Model set to mask paddings but no targets lengths provided for the mask!"
            )
        if is_training and is_evaluating:
            raise RuntimeError(
                "Model can not be in training and evaluation modes at the same time!"
            )

        split_device = "/cpu:0" if self._hparams.tacotron_num_gpus > 1 or \
            self._hparams.split_on_cpu else "/gpu:{}".format(
                self._hparams.tacotron_gpu_start_idx)
        with tf.device(split_device):
            hp = self._hparams
            lout_int = [tf.int32] * hp.tacotron_num_gpus
            lout_float = [tf.float32] * hp.tacotron_num_gpus

            tower_input_lengths = tf.split(
                input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
            tower_targets_lengths = \
                tf.split(targets_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) if \
                targets_lengths is not None else targets_lengths

            ### SV2TTS ###

            tower_embed_targets = tf.split(
                embed_targets, num_or_size_splits=hp.tacotron_num_gpus, axis=0)

            ##############

            p_inputs = tf.numpy_function(split_func,
                                         [inputs, split_infos[:, 0]], lout_int)
            p_mel_targets = tf.numpy_function(
                split_func, [mel_targets, split_infos[:, 1]],
                lout_float) if mel_targets is not None else mel_targets
            p_stop_token_targets = tf.numpy_function(split_func, [stop_token_targets, split_infos[:, 2]],
                                                     lout_float) if stop_token_targets is not None else \
                stop_token_targets

            tower_inputs = []
            tower_mel_targets = []
            tower_stop_token_targets = []

            batch_size = tf.shape(inputs)[0]
            mel_channels = hp.num_mels
            for i in range(hp.tacotron_num_gpus):
                tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
                if p_mel_targets is not None:
                    tower_mel_targets.append(
                        tf.reshape(p_mel_targets[i],
                                   [batch_size, -1, mel_channels]))
                if p_stop_token_targets is not None:
                    tower_stop_token_targets.append(
                        tf.reshape(p_stop_token_targets[i], [batch_size, -1]))

        self.tower_decoder_output = []
        self.tower_alignments = []
        self.tower_stop_token_prediction = []
        self.tower_mel_outputs = []
        # Initialized here so the post_condition branch below can append to it
        self.tower_linear_outputs = []

        tower_embedded_inputs = []
        tower_enc_conv_output_shape = []
        tower_encoder_cond_outputs = []
        tower_residual = []
        tower_projected_residual = []

        # 1. Declare GPU Devices
        gpus = [
            "/gpu:{}".format(i)
            for i in range(hp.tacotron_gpu_start_idx,
                           hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)
        ]
        for i in range(hp.tacotron_num_gpus):
            with tf.device(
                    tf.compat.v1.train.replica_device_setter(
                        ps_tasks=1, ps_device="/cpu:0",
                        worker_device=gpus[i])):
                with tf.compat.v1.variable_scope("inference") as scope:
                    assert hp.tacotron_teacher_forcing_mode in ("constant",
                                                                "scheduled")
                    if hp.tacotron_teacher_forcing_mode == "scheduled" and is_training:
                        assert global_step is not None

                    # GTA is only used for predicting mels to train Wavenet vocoder, so we omit
                    # post processing when doing GTA synthesis
                    post_condition = hp.predict_linear and not gta

                    # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                    self.embedding_table = tf.compat.v1.get_variable(
                        "inputs_embedding", [len(symbols), hp.embedding_dim],
                        dtype=tf.float32)
                    embedded_inputs = tf.nn.embedding_lookup(
                        self.embedding_table, tower_inputs[i])

                    # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                    encoder_cell = TacotronEncoderCell(
                        EncoderConvolutions(is_training,
                                            hparams=hp,
                                            scope="encoder_convolutions"),
                        EncoderRNN(is_training,
                                   size=hp.encoder_lstm_units,
                                   zoneout=hp.tacotron_zoneout_rate,
                                   scope="encoder_LSTM"))

                    encoder_outputs = encoder_cell(embedded_inputs,
                                                   tower_input_lengths[i])

                    # For shape visualization purpose
                    enc_conv_output_shape = encoder_cell.conv_output_shape

                    ### SV2TTS ###

                    # Append the speaker embedding to the encoder output at each timestep
                    tileable_shape = [
                        -1, 1, self._hparams.speaker_embedding_size
                    ]
                    tileable_embed_targets = tf.reshape(
                        tower_embed_targets[i], tileable_shape)
                    tiled_embed_targets = tf.tile(
                        tileable_embed_targets,
                        [1, tf.shape(encoder_outputs)[1], 1])
                    encoder_cond_outputs = tf.concat(
                        (encoder_outputs, tiled_embed_targets), 2)

                    ##############

                    # Decoder Parts
                    # Attention Decoder Prenet
                    prenet = Prenet(is_training,
                                    layers_sizes=hp.prenet_layers,
                                    drop_rate=hp.tacotron_dropout_rate,
                                    scope="decoder_prenet")
                    # Attention Mechanism
                    attention_mechanism = LocationSensitiveAttention(
                        hp.attention_dim,
                        encoder_cond_outputs,
                        hparams=hp,
                        mask_encoder=hp.mask_encoder,
                        memory_sequence_length=tf.reshape(
                            tower_input_lengths[i], [-1]),
                        smoothing=hp.smoothing,
                        cumulate_weights=hp.cumulative_weights)
                    # Decoder LSTM Cells
                    decoder_lstm = DecoderRNN(is_training,
                                              layers=hp.decoder_layers,
                                              size=hp.decoder_lstm_units,
                                              zoneout=hp.tacotron_zoneout_rate,
                                              scope="decoder_LSTM")
                    # Frames Projection layer
                    frame_projection = FrameProjection(
                        hp.num_mels * hp.outputs_per_step,
                        scope="linear_transform_projection")
                    # <stop_token> projection layer
                    stop_projection = StopProjection(
                        is_training or is_evaluating,
                        shape=hp.outputs_per_step,
                        scope="stop_token_projection")

                    # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                    decoder_cell = TacotronDecoderCell(prenet,
                                                       attention_mechanism,
                                                       decoder_lstm,
                                                       frame_projection,
                                                       stop_projection)

                    # Define the helper for our decoder
                    if is_training or is_evaluating or gta:
                        self.helper = TacoTrainingHelper(
                            batch_size, tower_mel_targets[i], hp, gta,
                            is_evaluating, global_step)
                    else:
                        self.helper = TacoTestHelper(batch_size, hp)

                    # initial decoder state
                    decoder_init_state = decoder_cell.zero_state(
                        batch_size=batch_size, dtype=tf.float32)

                    # Only use max iterations at synthesis time
                    max_iters = hp.max_iters if not (
                        is_training or is_evaluating) else None

                    # Decode
                    (frames_prediction, stop_token_prediction,
                     _), final_decoder_state, _ = dynamic_decode(
                         CustomDecoder(decoder_cell, self.helper,
                                       decoder_init_state),
                         impute_finished=False,
                         maximum_iterations=max_iters,
                         swap_memory=hp.tacotron_swap_with_cpu)

                    # Reshape outputs to be one output per entry
                    # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                    decoder_output = tf.reshape(frames_prediction,
                                                [batch_size, -1, hp.num_mels])
                    stop_token_prediction = tf.reshape(stop_token_prediction,
                                                       [batch_size, -1])

                    # Postnet
                    postnet = Postnet(is_training,
                                      hparams=hp,
                                      scope="postnet_convolutions")

                    # Compute residual using post-net ==> [batch_size, decoder_steps * r,
                    # postnet_channels]
                    residual = postnet(decoder_output)

                    # Project residual to same dimension as mel spectrogram
                    # ==> [batch_size, decoder_steps * r, num_mels]
                    residual_projection = FrameProjection(
                        hp.num_mels, scope="postnet_projection")
                    projected_residual = residual_projection(residual)

                    # Compute the mel spectrogram
                    mel_outputs = decoder_output + projected_residual

                    if post_condition:
                        # Add post-processing CBHG. This does a great job at extracting features
                        # from mels before projection to Linear specs.
                        post_cbhg = CBHG(hp.cbhg_kernels,
                                         hp.cbhg_conv_channels,
                                         hp.cbhg_pool_size,
                                         [hp.cbhg_projection, hp.num_mels],
                                         hp.cbhg_projection_kernel_size,
                                         hp.cbhg_highwaynet_layers,
                                         hp.cbhg_highway_units,
                                         hp.cbhg_rnn_units,
                                         is_training,
                                         name="CBHG_postnet")

                        # [batch_size, decoder_steps(mel_frames), cbhg_channels]
                        post_outputs = post_cbhg(mel_outputs, None)

                        # Linear projection of extracted features to make linear spectrogram
                        linear_specs_projection = FrameProjection(
                            hp.num_freq, scope="cbhg_linear_specs_projection")

                        # [batch_size, decoder_steps(linear_frames), num_freq]
                        linear_outputs = linear_specs_projection(post_outputs)

                    # Grab alignments from the final decoder state
                    alignments = tf.transpose(
                        final_decoder_state.alignment_history.stack(),
                        [1, 2, 0])

                    self.tower_decoder_output.append(decoder_output)
                    self.tower_alignments.append(alignments)
                    self.tower_stop_token_prediction.append(
                        stop_token_prediction)
                    self.tower_mel_outputs.append(mel_outputs)
                    tower_embedded_inputs.append(embedded_inputs)
                    tower_enc_conv_output_shape.append(enc_conv_output_shape)
                    tower_encoder_cond_outputs.append(encoder_cond_outputs)
                    tower_residual.append(residual)
                    tower_projected_residual.append(projected_residual)

                    if post_condition:
                        self.tower_linear_outputs.append(linear_outputs)
            log("initialisation done {}".format(gpus[i]))

        if is_training:
            self.ratio = self.helper._ratio
        self.tower_inputs = tower_inputs
        self.tower_input_lengths = tower_input_lengths
        self.tower_mel_targets = tower_mel_targets
        # self.tower_linear_targets = tower_linear_targets
        self.tower_targets_lengths = tower_targets_lengths
        self.tower_stop_token_targets = tower_stop_token_targets

        self.all_vars = tf.compat.v1.trainable_variables()

        log("Initialized Tacotron model. Dimensions (? = dynamic shape): ")
        log("  Train mode:               {}".format(is_training))
        log("  Eval mode:                {}".format(is_evaluating))
        log("  GTA mode:                 {}".format(gta))
        log("  Synthesis mode:           {}".format(not (
            is_training or is_evaluating)))
        log("  Input:                    {}".format(inputs.shape))
        for i in range(hp.tacotron_num_gpus):
            log("  device:                   {}".format(
                hp.tacotron_gpu_start_idx + i))
            log("  embedding:                {}".format(
                tower_embedded_inputs[i].shape))
            log("  enc conv out:             {}".format(
                tower_enc_conv_output_shape[i]))
            log("  encoder out (cond):       {}".format(
                tower_encoder_cond_outputs[i].shape))
            log("  decoder out:              {}".format(
                self.tower_decoder_output[i].shape))
            log("  residual out:             {}".format(
                tower_residual[i].shape))
            log("  projected residual out:   {}".format(
                tower_projected_residual[i].shape))
            log("  mel out:                  {}".format(
                self.tower_mel_outputs[i].shape))
            if post_condition:
                log("  linear out:               {}".format(
                    self.tower_linear_outputs[i].shape))
            log("  <stop_token> out:         {}".format(
                self.tower_stop_token_prediction[i].shape))

            # Written as 1000000 because 1_000_000 digit separators require Python 3.6+
            log("  Tacotron Parameters       {:.3f} Million.".format(
                np.sum(
                    [np.prod(v.get_shape().as_list())
                     for v in self.all_vars]) / 1000000))
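
split_func is invoked through tf.numpy_function above but not defined in these examples. A sketch consistent with those call sites, assuming each placeholder concatenates the per-GPU sub-batches along the time axis and split_infos holds one padded width per GPU:

def split_func(x, split_pos):
    # x: numpy value of one placeholder; split_pos: one column of split_infos.
    # Slice one chunk of width split_pos[i] along axis 1 for each GPU.
    rst = []
    start = 0
    for i in range(split_pos.shape[0]):
        rst.append(x[:, start:start + split_pos[i]])
        start += split_pos[i]
    return rst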
Example #8
    def __init__(self, coordinator, metadata_filename, hparams):
        super(Feeder, self).__init__()
        self._coord = coordinator
        self._hparams = hparams
        self._cleaner_names = [x.strip() for x in hparams.cleaners.split(",")]
        self._train_offset = 0
        self._test_offset = 0

        # Load metadata
        self._ppg_dir = os.path.join(os.path.dirname(metadata_filename),
                                     "ppgs")
        self._mel_dir = os.path.join(os.path.dirname(metadata_filename),
                                     "mels")
        self._embed_dir = os.path.join(os.path.dirname(metadata_filename),
                                       "embeds")
        with open(metadata_filename, encoding="utf-8") as f:
            self._metadata = [line.strip().split("|") for line in f]
            frame_shift_ms = hparams.hop_size / hparams.sample_rate
            hours = sum([int(x[5])
                         for x in self._metadata]) * frame_shift_ms / (3600)
            log("Loaded metadata for {} examples ({:.2f} hours)".format(
                len(self._metadata), hours))

        # Train test split
        if hparams.tacotron_test_size is None:
            assert hparams.tacotron_test_batches is not None

        test_size = (hparams.tacotron_test_size if hparams.tacotron_test_size
                     is not None else hparams.tacotron_test_batches *
                     hparams.tacotron_batch_size)
        indices = np.arange(len(self._metadata))
        train_indices, test_indices = train_test_split(
            indices,
            test_size=test_size,
            random_state=hparams.tacotron_data_random_state)

        # Make sure test_indices is a multiple of batch_size else round up
        len_test_indices = self._round_down(len(test_indices),
                                            hparams.tacotron_batch_size)
        extra_test = test_indices[len_test_indices:]
        test_indices = test_indices[:len_test_indices]
        train_indices = np.concatenate([train_indices, extra_test])

        self._train_meta = list(np.array(self._metadata)[train_indices])
        self._test_meta = list(np.array(self._metadata)[test_indices])

        self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size

        if hparams.tacotron_test_size is None:
            assert hparams.tacotron_test_batches == self.test_steps

        # pad input sequences with the <pad_token> 0 ( _ )
        self._pad = 0
        # explicitly setting the padding to a value that doesn't originally exist in the spectrogram
        # to avoid any possible conflicts, without affecting the output range of the model too much
        if hparams.symmetric_mels:
            self._target_pad = -hparams.max_abs_value
        else:
            self._target_pad = 0.
        # Mark finished sequences with 1s
        self._token_pad = 1.

        with tf.device("/cpu:0"):
            # Create placeholders for inputs and targets. Don"t specify batch size because we want
            # to be able to feed different batch sizes at eval time.
            self._placeholders = [
                tf.placeholder(tf.float32,
                               shape=(None, None, hparams.num_ppgs),
                               name="inputs"),
                tf.placeholder(tf.int32, shape=(None, ), name="input_lengths"),
                tf.placeholder(tf.float32,
                               shape=(None, None, hparams.num_mels),
                               name="mel_targets"),
                tf.placeholder(tf.float32,
                               shape=(None, None),
                               name="token_targets"),
                tf.placeholder(tf.int32,
                               shape=(None, ),
                               name="targets_lengths"),
                tf.placeholder(tf.int32,
                               shape=(hparams.tacotron_num_gpus, None),
                               name="split_infos"),

                # SV2TTS
                tf.placeholder(tf.float32,
                               shape=(None, hparams.speaker_embedding_size),
                               name="speaker_embeddings"),

                # adversarial speaker classifier
                tf.placeholder(tf.int32, shape=(None, ), name="speaker_labels")
            ]

            # Create queue for buffering data
            queue = tf.FIFOQueue(8, [
                tf.float32, tf.int32, tf.float32, tf.float32, tf.int32,
                tf.int32, tf.float32, tf.int32
            ],
                                 name="input_queue")
            self._enqueue_op = queue.enqueue(self._placeholders)
            self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \
            self.targets_lengths, self.split_infos, self.speaker_embeddings, self.speaker_labels = queue.dequeue()

            self.inputs.set_shape(self._placeholders[0].shape)
            self.input_lengths.set_shape(self._placeholders[1].shape)
            self.mel_targets.set_shape(self._placeholders[2].shape)
            self.token_targets.set_shape(self._placeholders[3].shape)
            self.targets_lengths.set_shape(self._placeholders[4].shape)
            self.split_infos.set_shape(self._placeholders[5].shape)
            self.speaker_embeddings.set_shape(self._placeholders[6].shape)
            self.speaker_labels.set_shape(self._placeholders[7].shape)

            # Create eval queue for buffering eval data
            eval_queue = tf.FIFOQueue(1, [
                tf.float32, tf.int32, tf.float32, tf.float32, tf.int32,
                tf.int32, tf.float32, tf.int32
            ],
                                      name="eval_queue")
            self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
            self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, \
            self.eval_token_targets, self.eval_targets_lengths, \
            self.eval_split_infos, self.eval_speaker_embeddings, self.eval_speaker_labels = eval_queue.dequeue()

            self.eval_inputs.set_shape(self._placeholders[0].shape)
            self.eval_input_lengths.set_shape(self._placeholders[1].shape)
            self.eval_mel_targets.set_shape(self._placeholders[2].shape)
            self.eval_token_targets.set_shape(self._placeholders[3].shape)
            self.eval_targets_lengths.set_shape(self._placeholders[4].shape)
            self.eval_split_infos.set_shape(self._placeholders[5].shape)
            self.eval_speaker_embeddings.set_shape(self._placeholders[6].shape)
            self.eval_speaker_labels.set_shape(self._placeholders[7].shape)
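
_round_down is used by the train/test split above but not shown in these examples; a sketch matching its use (truncate the test set to a whole number of batches, returning the remainder to training) could be:

    def _round_down(self, x, multiple):
        # Largest multiple of `multiple` not exceeding x.
        remainder = x % multiple
        return x if remainder == 0 else x - remainder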
Example #9
    def __init__(self,
                 checkpoint_path,
                 hparams,
                 gta=False,
                 model_name="Tacotron"):
        log("Constructing model: %s" % model_name)
        # Force the batch size to be known in order to use attention masking in batch synthesis
        inputs = tf.placeholder(tf.int32, (None, None), name="inputs")
        input_lengths = tf.placeholder(tf.int32, (None, ),
                                       name="input_lengths")
        speaker_embeddings = tf.placeholder(
            tf.float32, (None, hparams.speaker_embedding_size),
            name="speaker_embeddings")
        text_embeddings = tf.placeholder(
            tf.float32, (None, hparams.speaker_embedding_size),
            name="text_embeddings")
        targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels),
                                 name="mel_targets")
        split_infos = tf.placeholder(tf.int32,
                                     shape=(hparams.tacotron_num_gpus, None),
                                     name="split_infos")
        # 200000 is the training step of the checkpoint being loaded
        global_step = tf.Variable(200000, name="global_step", trainable=False)

        with tf.variable_scope("Tacotron_model", reuse=tf.AUTO_REUSE) as scope:
            self.model = create_model_ph(model_name, hparams)
            if gta:
                self.model.initialize(inputs,
                                      input_lengths,
                                      speaker_embeddings,
                                      text_embeddings,
                                      targets,
                                      gta=gta,
                                      split_infos=split_infos,
                                      global_step=global_step)
            else:
                self.model.initialize(inputs,
                                      input_lengths,
                                      speaker_embeddings,
                                      text_embeddings,
                                      split_infos=split_infos,
                                      global_step=global_step)

            self.mel_outputs = self.model.tower_mel_outputs
            self.linear_outputs = self.model.tower_linear_outputs if (
                hparams.predict_linear and not gta) else None
            self.alignments = self.model.tower_alignments
            self.stop_token_prediction = self.model.tower_stop_token_prediction
            self.targets = targets

        self.gta = gta
        self._hparams = hparams
        self._pad = 0
        if hparams.symmetric_mels:
            self._target_pad = -hparams.max_abs_value
        else:
            self._target_pad = 0.

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.speaker_embeddings = speaker_embeddings
        self.text_embeddings = text_embeddings
        self.targets = targets
        self.split_infos = split_infos

        log("Loading checkpoint: %s" % checkpoint_path)
        # Memory allocation on the GPUs as needed
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        self.session = tf.Session(config=config)
        self.session.run(tf.global_variables_initializer())

        saver = tf.train.Saver()
        saver.restore(self.session, checkpoint_path)
Example #10
    def __init__(self, coordinator, hparams):
        super(Feeder, self).__init__()
        self._coord = coordinator
        self._hparams = hparams
        self._train_offset = 0
        self._test_offset = 0

        gparent_dir = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        data_dir = os.path.join(gparent_dir, 'data')
        clf_data_path = os.path.join(data_dir, 'clf_data.pkl')
        clf_data = load_pkl(clf_data_path)
        texts = clf_data['texts']
        mel_paths = clf_data['paths']

        emb_paths = [p.replace('mel-', 'mbed-') for p in mel_paths]
        emb_paths = [p.replace('stft', 'speaker_emb') for p in emb_paths]

        text_emb_paths = [p.replace('mel-', '') for p in mel_paths]
        text_emb_paths = [p.replace('stft', 'devise') for p in text_emb_paths]

        # Load metadata
        self._metadata = [[
            m, e, te, t
        ] for m, e, te, t in zip(mel_paths, emb_paths, text_emb_paths, texts)]
        # each element is a size-4 list, i.e. [mel_path, embed_path, text_embed_path, text]
        log("Loaded metadata for %d examples" % len(self._metadata))

        # Train test split
        if hparams.tacotron_test_size is None:
            assert hparams.tacotron_test_batches is not None

        test_size = (hparams.tacotron_test_size if hparams.tacotron_test_size
                     is not None else hparams.tacotron_test_batches *
                     hparams.tacotron_batch_size)
        indices = np.arange(len(self._metadata))
        train_indices, test_indices = train_test_split(
            indices,
            test_size=test_size,
            random_state=hparams.tacotron_data_random_state)

        # Make sure test_indices is a multiple of batch_size else round up
        len_test_indices = self._round_down(len(test_indices),
                                            hparams.tacotron_batch_size)
        extra_test = test_indices[len_test_indices:]
        test_indices = test_indices[:len_test_indices]
        train_indices = np.concatenate([train_indices, extra_test])

        self._train_meta = list(np.array(self._metadata)[train_indices])
        self._test_meta = list(np.array(self._metadata)[test_indices])

        self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size

        if hparams.tacotron_test_size is None:
            assert hparams.tacotron_test_batches == self.test_steps

        # pad input sequences with the <pad_token> 0 ( _ )
        self._pad = 0
        if hparams.symmetric_mels:
            self._target_pad = -hparams.max_abs_value
        else:
            self._target_pad = 0.
        self._token_pad = 1.

        with tf.device("/cpu:0"):
            # Create placeholders for inputs and targets. Don't specify batch size because we want
            # to be able to feed different batch sizes at eval time.
            self._placeholders = [
                tf.placeholder(tf.int32, shape=(None, None), name="inputs"),
                tf.placeholder(tf.int32, shape=(None, ), name="input_lengths"),
                tf.placeholder(tf.float32,
                               shape=(None, None, hparams.num_mels),
                               name="mel_targets"),
                tf.placeholder(tf.float32,
                               shape=(None, None),
                               name="token_targets"),
                tf.placeholder(tf.int32,
                               shape=(None, ),
                               name="targets_lengths"),
                tf.placeholder(tf.int32,
                               shape=(hparams.tacotron_num_gpus, None),
                               name="split_infos"),

                # SV2TTS
                tf.placeholder(tf.float32,
                               shape=(None, hparams.speaker_embedding_size),
                               name="speaker_embeddings"),
                tf.placeholder(tf.float32,
                               shape=(None, hparams.speaker_embedding_size),
                               name="text_embeddings")
            ]

            # Create queue for buffering data
            queue = tf.FIFOQueue(8, [
                tf.int32, tf.int32, tf.float32, tf.float32, tf.int32, tf.int32,
                tf.float32, tf.float32
            ],
                                 name="input_queue")
            self._enqueue_op = queue.enqueue(self._placeholders)
            self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \
                self.targets_lengths, self.split_infos, self.speaker_embeddings, \
                self.text_embeddings = queue.dequeue()

            self.inputs.set_shape(self._placeholders[0].shape)
            self.input_lengths.set_shape(self._placeholders[1].shape)
            self.mel_targets.set_shape(self._placeholders[2].shape)
            self.token_targets.set_shape(self._placeholders[3].shape)
            self.targets_lengths.set_shape(self._placeholders[4].shape)
            self.split_infos.set_shape(self._placeholders[5].shape)
            self.speaker_embeddings.set_shape(self._placeholders[6].shape)
            self.text_embeddings.set_shape(self._placeholders[7].shape)

            # Create eval queue for buffering eval data
            eval_queue = tf.FIFOQueue(1, [
                tf.int32, tf.int32, tf.float32, tf.float32, tf.int32, tf.int32,
                tf.float32, tf.float32
            ],
                                      name="eval_queue")
            self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
            self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, \
                self.eval_token_targets, self.eval_targets_lengths, \
                self.eval_split_infos, self.eval_speaker_embeddings, \
                self.eval_text_embeddings = eval_queue.dequeue()

            self.eval_inputs.set_shape(self._placeholders[0].shape)
            self.eval_input_lengths.set_shape(self._placeholders[1].shape)
            self.eval_mel_targets.set_shape(self._placeholders[2].shape)
            self.eval_token_targets.set_shape(self._placeholders[3].shape)
            self.eval_targets_lengths.set_shape(self._placeholders[4].shape)
            self.eval_split_infos.set_shape(self._placeholders[5].shape)
            self.eval_speaker_embeddings.set_shape(self._placeholders[6].shape)
            self.eval_text_embeddings.set_shape(self._placeholders[7].shape)
Example #11
    def __init__(self, coordinator, metadata_filename, hparams):
        super(Feeder, self).__init__()
        self.encoder_path = Path(hparams.encoder_path)
        self._coord = coordinator
        self._hparams = hparams
        self._cleaner_names = hparams.cleaners
        self._train_offset = 0
        self._test_offset = 0

        # Load metadata
        self._audio_dir = os.path.join(os.path.dirname(metadata_filename), "audio")
        self._mel_dir = os.path.join(os.path.dirname(metadata_filename), "mels")
        self._embed_dir = os.path.join(os.path.dirname(metadata_filename), "embeds")
        with open(metadata_filename, encoding="utf8") as fin:
            self._metadata = []
            for line in tqdm(fin, ncols=50, mininterval=2):
                # Supports both relative and absolute paths, e.g.:
                # ../data/samples/aliaudio/Aibao/005397.mp3|mel-aliaudio-Aibao-005397.mp3.npy|embed-aliaudio-Aibao-005397.mp3.npy|64403|254|他走近钢琴并开始演奏“祖国从哪里开始”。
                audio_path, mel_path, embed_path, audio_size, mel_size, text = line.strip().split("|")
                if not os.path.exists(audio_path):
                    audio_path = os.path.join(self._audio_dir, audio_path)
                if not os.path.exists(mel_path):
                    mel_path = os.path.join(self._mel_dir, mel_path)
                if not os.path.exists(embed_path):
                    embed_path = os.path.join(self._embed_dir, embed_path)

                if os.path.exists(audio_path) and os.path.exists(mel_path) and os.path.exists(embed_path):
                    self._metadata.append([audio_path, mel_path, embed_path, audio_size, mel_size, text])
                else:
                    print("Load data failed!")
                    print("data:", line)

            frame_shift_ms = hparams.hop_size / hparams.sample_rate
            hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / (3600)
            log("Loaded metadata for {} examples ({:.2f} hours)".format(len(self._metadata), hours))

        # Train test split
        if hparams.tacotron_test_size is None:
            assert hparams.tacotron_test_batches is not None

        test_size = (hparams.tacotron_test_size if hparams.tacotron_test_size is not None
                     else hparams.tacotron_test_batches * hparams.tacotron_batch_size)
        indices = np.arange(len(self._metadata))
        train_indices, test_indices = train_test_split(indices,
                                                       test_size=test_size,
                                                       random_state=hparams.tacotron_data_random_state)

        # Make sure test_indices is a multiple of batch_size else round up
        len_test_indices = self._round_down(len(test_indices), hparams.tacotron_batch_size)
        extra_test = test_indices[len_test_indices:]
        test_indices = test_indices[:len_test_indices]
        train_indices = np.concatenate([train_indices, extra_test])

        self._train_meta = list(np.array(self._metadata)[train_indices])
        self._test_meta = list(np.array(self._metadata)[test_indices])
        np.random.shuffle(self._train_meta)
        np.random.shuffle(self._test_meta)
        self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size

        if hparams.tacotron_test_size is None:
            assert hparams.tacotron_test_batches == self.test_steps

        # pad input sequences with the <pad_token> 0 ( _ )
        self._pad = 0
        # explicitly setting the padding to a value that doesn't originally exist in the spectrogram
        # to avoid any possible conflicts, without affecting the output range of the model too much
        if hparams.symmetric_mels:
            self._target_pad = -hparams.max_abs_value
        else:
            self._target_pad = 0.
        # Mark finished sequences with 1s
        self._token_pad = 1.

        with tf.device("/cpu:0"):
            # Create placeholders for inputs and targets. Don't specify batch size because we want
            # to be able to feed different batch sizes at eval time.
            self._placeholders = [
                tf.placeholder(tf.int32, shape=(None, None), name="inputs"),
                tf.placeholder(tf.int32, shape=(None,), name="input_lengths"),
                tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels),
                               name="mel_targets"),
                tf.placeholder(tf.float32, shape=(None, None), name="token_targets"),
                tf.placeholder(tf.int32, shape=(None,), name="targets_lengths"),
                tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None),
                               name="split_infos"),

                # SV2TTS
                tf.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size),
                               name="speaker_embeddings")
            ]

            # Create queue for buffering data
            queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32,
                                     tf.int32, tf.int32, tf.float32], name="input_queue")
            self._enqueue_op = queue.enqueue(self._placeholders)
            self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \
            self.targets_lengths, self.split_infos, self.speaker_embeddings = queue.dequeue()

            self.inputs.set_shape(self._placeholders[0].shape)
            self.input_lengths.set_shape(self._placeholders[1].shape)
            self.mel_targets.set_shape(self._placeholders[2].shape)
            self.token_targets.set_shape(self._placeholders[3].shape)
            self.targets_lengths.set_shape(self._placeholders[4].shape)
            self.split_infos.set_shape(self._placeholders[5].shape)
            self.speaker_embeddings.set_shape(self._placeholders[6].shape)

            # Create eval queue for buffering eval data
            eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32,
                                          tf.int32, tf.int32, tf.float32], name="eval_queue")
            self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
            self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, \
            self.eval_token_targets, self.eval_targets_lengths, \
            self.eval_split_infos, self.eval_speaker_embeddings = eval_queue.dequeue()

            self.eval_inputs.set_shape(self._placeholders[0].shape)
            self.eval_input_lengths.set_shape(self._placeholders[1].shape)
            self.eval_mel_targets.set_shape(self._placeholders[2].shape)
            self.eval_token_targets.set_shape(self._placeholders[3].shape)
            self.eval_targets_lengths.set_shape(self._placeholders[4].shape)
            self.eval_split_infos.set_shape(self._placeholders[5].shape)
            self.eval_speaker_embeddings.set_shape(self._placeholders[6].shape)