Example #1
    def test_init(self):
        hparams = self._get_hparams()
        hparams.out_dir = os.path.join(
            hparams.out_dir, "test_init")  # Add function name to path.

        trainer = AcousticModelTrainer(self.dir_world_features,
                                       self.dir_question_labels, self.id_list,
                                       hparams.num_questions, hparams)
        trainer.init(hparams)

        shutil.rmtree(hparams.out_dir)
Example #2
    def create_hparams(hparams_string=None, verbose=False):
        # Combine the parameters needed to train the acoustic and duration models.
        hparams = AcousticModelTrainer.create_hparams(hparams_string,
                                                      verbose=False)
        hparams_duration = DurationModelTrainer.create_hparams(hparams_string,
                                                               verbose=False)
        hparams.override_from_hparam(hparams_duration)

        # Add parameters required for full TTS.
        hparams.add_hparams(front_end=None,
                            front_end_accent=None,
                            festival_dir=None,
                            file_symbol_dict=None,
                            num_phoneme_states=None,
                            duration_labels_dir=None,
                            duration_norm_file_name=None,
                            duration_model=None,
                            question_labels_norm_file=None,
                            world_features_dir=None,
                            acoustic_model=None,
                            synth_load_org_lf0=False,
                            synth_load_org_vuv=False,
                            synth_load_org_bap=False)

        if verbose:
            logging.info(hparams.get_debug_string())

        return hparams
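The combined hyper-parameter set above is what a full TTS pipeline consumes. As a minimal sketch of how it might be filled in, assuming a containing class here called TTSModel that exposes this create_hparams (the class name and all paths below are placeholders, not taken from the example):

    hparams = TTSModel.create_hparams(verbose=True)
    hparams.front_end = "/path/to/scripts/tts_frontend/makeLabels.sh"
    hparams.festival_dir = "/path/to/festival"
    hparams.duration_model = "/path/to/duration_model.nn"
    hparams.acoustic_model = "/path/to/acoustic_model.nn"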
Example #3
    def _get_hparams(self):
        hparams = AcousticModelTrainer.create_hparams()
        # General parameters
        hparams.num_questions = 409
        hparams.voice = "full"
        hparams.data_dir = os.path.realpath(
            os.path.join("integration", "fixtures", "database"))
        hparams.out_dir = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            type(self).__name__)

        hparams.frame_size_ms = 5
        hparams.num_coded_sps = 20
        hparams.seed = 1

        # Training parameters.
        hparams.epochs = 3
        hparams.use_gpu = False
        hparams.model_type = "RNNDYN-1_RELU_32-1_FC_67"
        hparams.batch_size_train = 2
        hparams.batch_size_val = 50
        hparams.use_saved_learning_rate = True
        hparams.optimiser_args["lr"] = 0.001
        hparams.model_name = "test_model.nn"
        hparams.epochs_per_checkpoint = 2

        return hparams
Example #4
 def _get_acoustic_trainer(self, hparams):
     dir_world_features = os.path.join("integration", "fixtures", "WORLD")
     dir_question_labels = os.path.join("integration", "fixtures",
                                        "questions")
     return AcousticModelTrainer(dir_world_features, dir_question_labels,
                                 self.id_list, hparams.num_questions,
                                 hparams)
Example #5
    def test_benchmark(self):
        hparams = self._get_hparams()
        hparams.out_dir = os.path.join(
            hparams.out_dir, "test_benchmark")  # Add function name to path.
        hparams.seed = 1

        trainer = AcousticModelTrainer(self.dir_world_features,
                                       self.dir_question_labels, self.id_list,
                                       hparams.num_questions, hparams)
        trainer.init(hparams)
        scores = trainer.benchmark(hparams)

        numpy.testing.assert_almost_equal((8.616, 78.4, 0.609, 37.352), scores,
                                          3, "Wrong benchmark score.")

        shutil.rmtree(hparams.out_dir)
Example #6
    def generate_audio_features(self, id_list,
                                hparams):  # TODO: This function is untested.
        """
        Generate mgc, vuv and bap data with an acoustic model.
        The name of the acoustic model is saved in hparams.synth_acoustic_model_path and given in the constructor.
        If the synth_acoustic_model_path is 'None' this method will not be called but the method
        load_extracted_audio_features, which reloads the original data extracted from the audio.

        If you want to generate audio directly from wcad atom extraction, uncomment the first block
        in the get_recon_from_synth_output method.

        Detailed execution process:
        This method reuses the synth method of the ModelTrainer base class. It overwrites the internal
        f_synthesize method and the OutputGen to accomplish the audio generation. Both are restored
        after finishing the generation. The base class synth method loads the acoustic model network
        by its name and forwards the question labels for each utterance in the id_list. At the
        end the method calls the f_synthesize method. Therefore the f_synthesize method is overwritten
        by the save_audio_features which saves the generate output mgc, vuv and bap files in the
        self.synth_dir folder.
        """
        self.logger.info("Generate mgc, vuv and bap with " +
                         hparams.synth_acoustic_model_path)

        acoustic_model_hparams = AcousticModelTrainer.create_hparams()
        acoustic_model_hparams.model_name = os.path.basename(
            hparams.synth_acoustic_model_path)
        acoustic_model_hparams.model_path = hparams.synth_acoustic_model_path
        acoustic_model_handler = AcousticModelTrainer(acoustic_model_hparams)

        org_model_handler = self.model_handler
        self.model_handler = acoustic_model_handler

        # Switch f_synthesize method and OutputGen for mgc, vuv and bap creation.
        # f_synthesize is called at the end of synth.
        self.f_synthesize = self.save_audio_features
        org_output_gen = self.OutputGen
        self.OutputGen = self.AudioGen

        # Explicitly synthesize with acoustic_model_name.
        # This method calls f_synthesize at the end which will save the mgc, vuv and bap.
        self.synth(hparams, id_list)

        # Switch back to atom creation.
        self.f_synthesize = self.synthesize
        self.OutputGen = org_output_gen
        self.model_handler = org_model_handler
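Assuming a trainer instance set up as in the surrounding examples, a call site for this method could look like the following sketch; the model path and id_list entry are placeholders:

    hparams.synth_acoustic_model_path = "/path/to/acoustic_model.nn"
    trainer.generate_audio_features(["p225/p225_051"], hparams)
    # The generated mgc, vuv and bap files are saved in self.synth_dir.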
Example #7
    def test_synth_wav(self):
        num_test_files = 2

        hparams = self._get_hparams()
        hparams.out_dir = os.path.join(
            hparams.out_dir, "test_synth_wav")  # Add function name to path
        hparams.model_name = "test_model_in409_out67"
        hparams.model_path = os.path.join("integration", "fixtures",
                                          hparams.model_name,
                                          hparams.networks_dir)
        hparams.synth_fs = 16000
        hparams.frame_size_ms = 5
        hparams.synth_ext = "wav"
        hparams.synth_load_org_sp = True
        hparams.synth_load_org_lf0 = True
        hparams.synth_load_org_vuv = True
        hparams.synth_load_org_bap = True

        trainer = AcousticModelTrainer(
            **AcousticModelTrainer.legacy_support_init(
                self.dir_world_features, self.dir_question_labels,
                self.id_list, hparams.num_questions, hparams))
        trainer.init(hparams)
        hparams.synth_dir = os.path.join(hparams.out_dir, hparams.model_name)
        trainer.synth(hparams, self.id_list[:num_test_files])

        found_files = [
            name for name in os.listdir(hparams.synth_dir)
            if os.path.isfile(os.path.join(hparams.synth_dir, name))
            and name.endswith("_WORLD." + hparams.synth_ext)
        ]
        # Check number of created files.
        self.assertEqual(
            len(self.id_list[:num_test_files]), len(found_files),
            msg="Number of {} files in synth_dir directory does not match."
                .format(hparams.synth_ext))

        # Check readability and length of one created file.
        raw, fs = soundfile.read(
            os.path.join(hparams.synth_dir, found_files[0]))
        self.assertEqual(
            hparams.synth_fs,
            fs,
            msg="Desired sampling frequency of output doesn't match.")
        # Pick the datareader entry whose id_name matches the first found file.
        id_name = [id_name for id_name in self.id_list[:num_test_files]
                   if id_name in found_files[0]][0]
        labels = trainer.datareaders["acoustic_features"][id_name]
        # Expected number of frames, e.g. 16000 samples at 16 kHz are 1 s,
        # i.e. 200 frames of 5 ms.
        expected_length = len(raw) / hparams.synth_fs * 1000 / hparams.frame_size_ms
        self.assertTrue(
            abs(expected_length - len(labels["acoustic_features"])) < 10,
            msg="Saved raw audio file length does not roughly match length "
                "of labels.")

        shutil.rmtree(hparams.out_dir)
Example #8
    def test_train(self):
        hparams = self._get_hparams()
        hparams.out_dir = os.path.join(
            hparams.out_dir, "test_train")  # Add function name to path.
        hparams.seed = 1234
        hparams.use_best_as_final_model = False

        trainer = AcousticModelTrainer(self.dir_world_features,
                                       self.dir_question_labels, self.id_list,
                                       hparams.num_questions, hparams)
        trainer.init(hparams)
        _, all_loss_train, _ = trainer.train(hparams)

        # Training loss should decrease. When hparams.start_with_test is set,
        # entry 0 holds the loss computed before training starts, so compare
        # against entry 1 instead.
        self.assertLess(all_loss_train[-1],
                        all_loss_train[1 if hparams.start_with_test else 0],
                        msg="Loss did not decrease over {} epochs".format(
                            hparams.epochs))

        shutil.rmtree(hparams.out_dir)
Example #9
    def create_hparams(hparams_string=None, verbose=False):

        hparams = AcousticModelTrainer.create_hparams(hparams_string,
                                                      verbose=False)
        hparams.add_deltas = False

        default_hparams = {
            # "n_output_channels": None,
            "load_sp": True,
            "load_lf0": True,
            "load_vuv": True,
            "load_bap": True,
            "n_symbols": None,
            "symbols_embedding_dim": 512,

            # # Encoder parameters
            #
            # # Decoder parameters
            "add_EOF_gate": False,
            "n_frames_per_step": 1,
            "single_encoder_input_per_step": False,
            "max_decoder_steps": 1000,  # 200*18,
            # "gate_threshold": 0.5,
            "p_text_encoder_dropout": 0.5,
            "p_audio_encoder_dropout": 0.5,
            # "p_attention_dropout": 0.4,
            "p_decoder_dropout": 0.1,
            "p_audio_decoder_dropout": 0.5,
            #
            # # Attention parameters
            # "attention_rnn_dim": 1024,
            # "attention_dim": 128,
            #
            # # Location Layer parameters
            # "attention_location_n_filters": 32,
            # "attention_location_kernel_size": 31,
            #
            # # Mel-post processing network parameters
            # "postnet_embedding_dim": 512,
            # "postnet_kernel_size": 5,
            # "postnet_n_convolutions": 5,
            #
            # "mask_padding": False,
        }

        # Fill missing values by default values.
        for key, value in default_hparams.items():
            if not hasattr(hparams, key):
                hparams.add_hparam(key, value)

        if verbose:
            logging.info('Final parsed hparams: %s', hparams.values())

        return hparams
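Because the fill-by-default loop above only adds keys that are not already present, values parsed from hparams_string take precedence over the defaults. A sketch, assuming the base create_hparams both parses and registers "key=value" pairs from the string:

    hparams = create_hparams("load_sp=False", verbose=True)
    assert hparams.load_sp is False               # Explicit value is kept.
    assert hparams.symbols_embedding_dim == 512   # Missing key gets the default.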
Example #10
    def test_gen_figure(self):
        num_test_files = 2

        hparams = self._get_hparams()
        hparams.out_dir = os.path.join(
            hparams.out_dir, "test_gen_figure")  # Add function name to path
        hparams.model_name = "test_model_in409_out67.nn"
        hparams.model_path = os.path.join("integration", "fixtures",
                                          hparams.model_name)

        trainer = AcousticModelTrainer(self.dir_world_features,
                                       self.dir_question_labels, self.id_list,
                                       hparams.num_questions, hparams)
        trainer.init(hparams)

        with warnings.catch_warnings():
            warnings.filterwarnings("ignore",
                                    category=UserWarning,
                                    module="matplotlib")
            trainer.gen_figure(hparams, self.id_list[:num_test_files])
        # Check number of created files.
        found_files = [
            name for name in os.listdir(hparams.out_dir)
            if os.path.isfile(os.path.join(hparams.out_dir, name))
            and name.endswith(hparams.model_name + ".Org-PyTorch" +
                              hparams.gen_figure_ext)
        ]
        self.assertEqual(
            len(self.id_list[:num_test_files]), len(found_files),
            msg="Number of {} files in out_dir directory does not match."
                .format(hparams.gen_figure_ext))

        shutil.rmtree(hparams.out_dir)
Example #11
    def _get_acoustic_hparams(self):
        hparams = AcousticModelTrainer.create_hparams()
        # General parameters
        hparams.num_questions = 409
        # hparams.data_dir = os.path.realpath(os.path.join("integration", "fixtures", "database"))
        hparams.out_dir = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            type(self).__name__)

        hparams.frame_size_ms = 5
        hparams.num_coded_sps = 20
        hparams.seed = 1234

        # Training parameters.
        hparams.epochs = 0
        hparams.use_gpu = False
        hparams.model_type = "RNNDYN-1_RELU_32-1_FC_67"
        hparams.model_name = "AM.nn"

        return hparams
Example #12
    def create_hparams(hparams_string=None, verbose=False):
        """Create model hyper-parameters. Parse non-default values from the given string."""

        hparams = AcousticModelTrainer.create_hparams(hparams_string,
                                                      verbose=False)
        hparams.add_hparams(
            # Computes an index from the id_name of a sample to index the embedding vector.
            f_get_emb_index=None,
            # Can be any type registered in ModelFactory.
            pre_net_model_type=None,
            # Used to load a model when pre_net_model_type is None.
            pre_net_model_name=None,
            pre_net_model_path=None,
            train_pre_net=True,
            pass_embs_to_pre_net=False,
            num_coded_sps=30,
            num_speakers=None,
            speaker_emb_dim=128)

        if verbose:
            logging.info(hparams.get_debug_string())

        return hparams
Example #13
def main():
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = "English"
    hparams.model_name = "AllPassWarpModelTest.nn"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    # hparams.num_questions = 505
    hparams.num_questions = 425
    hparams.out_dir = os.path.join("experiments", hparams.voice,
                                   "VTLNArtificiallyWarped")
    hparams.data_dir = os.path.realpath("database")
    hparams.model_name = "all_pass_warp_test"
    hparams.synth_dir = hparams.out_dir
    batch_size = 2
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    # hparams.add_hparam("warp_matrix_size", hparams.num_coded_sps)
    hparams.alpha_ranges = [
        0.2,
    ]

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps,
                               num_bap=hparams.num_bap)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    trainer = AcousticModelTrainer(
        "experiments/" + hparams.voice + "/WORLD",
        "experiments/" + hparams.voice + "/questions", "ignored",
        hparams.num_questions, hparams)

    sp_mean = gen_in.norm_params[0][:hparams.num_coded_sps *
                                    (3 if hparams.add_deltas else 1)]
    sp_std_dev = gen_in.norm_params[1][:hparams.num_coded_sps *
                                       (3 if hparams.add_deltas else 1)]
    all_pass_warp_model = AllPassWarpModel((hparams.num_coded_sps, ),
                                           (hparams.num_coded_sps, ), hparams)
    all_pass_warp_model.set_norm_params(sp_mean, sp_std_dev)

    # id_list = ["dorian/doriangray_16_00199"]
    # id_list = ["p225/p225_051", "p277/p277_012", "p278/p278_012", "p279/p279_012"]
    id_list = ["p225/p225_051"]

    t_benchmark = 0
    for id_name in id_list:
        sample = WorldFeatLabelGen.load_sample(
            id_name,
            os.path.join("experiments", hparams.voice, "WORLD"),
            add_deltas=True,
            num_coded_sps=hparams.num_coded_sps,
            num_bap=hparams.num_bap,
            sp_type=hparams.sp_type)
        sample_pre = gen_in.preprocess_sample(sample)
        coded_sps = sample_pre[:, :hparams.num_coded_sps *
                               (3 if hparams.add_deltas else 1)].copy()
        coded_sps = coded_sps[:, None,
                              ...].repeat(batch_size,
                                          1)  # Copy data in batch dimension.

        for idx, alpha in enumerate(np.arange(-0.2, 0.21, 0.05)):
            out_dir = os.path.join(hparams.out_dir,
                                   "alpha_{0:0.2f}".format(alpha))
            makedirs_safe(out_dir)

            alpha_vec = np.ones((coded_sps.shape[0], 1)) * alpha
            alpha_vec = alpha_vec[:, None].repeat(
                batch_size, 1)  # Copy data in batch dimension.

            t_start = timer()
            sp_warped, (_, nn_alpha) = all_pass_warp_model(
                torch.from_numpy(coded_sps.copy()),
                None, (len(coded_sps), ), (len(coded_sps), ),
                alphas=torch.tensor(alpha_vec, requires_grad=True))
            sp_warped.sum().backward()
            t_benchmark += timer() - t_start
            # assert((mfcc_warped[:, 0] == mfcc_warped[:, 1]).all())  # Compare results for cloned coded_sps within batch.
            if np.isclose(alpha, 0):
                assert np.isclose(
                    sp_warped.detach().cpu().numpy(),
                    coded_sps).all()  # Compare no warping results.
            sample_pre[:len(sp_warped), :hparams.num_coded_sps * (
                3 if hparams.add_deltas else 1)] = sp_warped[:, 0].detach()

            sample_post = gen_in.postprocess_sample(sample_pre,
                                                    apply_mlpg=False)
            # Manually create samples without normalisation but with deltas.
            sample_pre_with_deltas = (sample_pre * gen_in.norm_params[1] +
                                      gen_in.norm_params[0]).astype(np.float32)

            if np.isnan(sample_pre_with_deltas).any():
                raise ValueError(
                    "Detected nan values in output features for {}.".format(
                        id_name))
            # Save warped features.
            makedirs_safe(os.path.dirname(os.path.join(out_dir, id_name)))
            sample_pre_with_deltas.tofile(
                os.path.join(out_dir,
                             id_name + "." + WorldFeatLabelGen.ext_deltas))

            hparams.synth_dir = out_dir
            # sample_no_deltas = WorldFeatLabelGen.convert_from_world_features(*WorldFeatLabelGen.convert_to_world_features(sample, contains_deltas=hparams.add_deltas, num_coded_sps=hparams.num_coded_sps, num_bap=hparams.num_bap))
            Synthesiser.run_world_synth({id_name: sample_post}, hparams)

    print("Process time for {} runs: {}, average: {}".format(
        len(id_list) * idx, timedelta(seconds=t_benchmark),
        timedelta(seconds=t_benchmark) / (len(id_list) * idx)))
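For background: a layer like AllPassWarpModel typically realises the first-order all-pass transform used for vocal tract length normalisation. The exact internals of the layer are not shown in this example, so the formulas below are standard background rather than a description of this code. In the z-domain the warp with parameter \alpha is

    \tilde{z}^{-1} = \frac{z^{-1} - \alpha}{1 - \alpha z^{-1}}, \qquad |\alpha| < 1,

which warps a frequency \omega to

    \tilde{\omega} = \omega + 2 \arctan\!\left(\frac{\alpha \sin\omega}{1 - \alpha \cos\omega}\right).

For \alpha = 0 the transform is the identity, which is exactly what the np.isclose(alpha, 0) assertion in the loop above verifies.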
Example #14
def main():
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = "English"
    hparams.model_name = "WarpingLayerTest.nn"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    # hparams.num_questions = 505
    hparams.num_questions = 425
    hparams.out_dir = "experiments/" + hparams.voice + "/VTLNArtificiallyWarped/"
    hparams.data_dir = os.path.realpath("database")
    hparams.model_name = "warping_layer_test"
    hparams.synth_dir = hparams.out_dir
    batch_size = 2
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    trainer = AcousticModelTrainer(
        "experiments/" + hparams.voice + "/WORLD",
        "experiments/" + hparams.voice + "/questions", "ignored",
        hparams.num_questions, hparams)

    sp_mean = gen_in.norm_params[0][:hparams.num_coded_sps *
                                    (3 if hparams.add_deltas else 1)]
    sp_std_dev = gen_in.norm_params[1][:hparams.num_coded_sps *
                                       (3 if hparams.add_deltas else 1)]
    wl = WarpingLayer((hparams.num_coded_sps, ), (hparams.num_coded_sps, ),
                      hparams)
    wl.set_norm_params(sp_mean, sp_std_dev)

    # id_list = ["dorian/doriangray_16_00199"]
    id_list = ["p225/p225_051"]
    hparams.num_speakers = 1

    t_benchmark = 0
    for id_name in id_list:
        for idx, alpha in enumerate(np.arange(-0.15, 0.2, 0.05)):
            out_dir = hparams.out_dir + "alpha_{0:0.2f}/".format(alpha)
            makedirs_safe(out_dir)

            sample = WorldFeatLabelGen.load_sample(
                id_name,
                os.path.join("experiments", hparams.voice, "WORLD"),
                add_deltas=True,
                num_coded_sps=hparams.num_coded_sps)
            sample_pre = gen_in.preprocess_sample(sample)
            coded_sps = sample_pre[:, :hparams.num_coded_sps *
                                   (3 if hparams.add_deltas else 1)]

            alpha_vec = np.ones((coded_sps.shape[0], 1)) * alpha

            coded_sps = coded_sps[:len(alpha_vec), None, ...].repeat(
                batch_size, 1)  # Copy data in batch dimension.
            alpha_vec = alpha_vec[:, None, None].repeat(
                batch_size, 1)  # Copy data in batch dimension.

            t_start = timer()
            mfcc_warped, (_, nn_alpha) = wl(torch.from_numpy(coded_sps),
                                            None, (len(coded_sps), ),
                                            (len(coded_sps), ),
                                            alphas=torch.from_numpy(alpha_vec))
            mfcc_warped.sum().backward()
            t_benchmark += timer() - t_start
            # Compare results for cloned coded_sps within batch.
            assert (mfcc_warped[:, 0] == mfcc_warped[:, 1]).all()
            # Use np.isclose: alpha comes from np.arange and is not exactly 0.
            if np.isclose(alpha, 0):
                # Compare results for no warping.
                assert np.isclose(mfcc_warped.detach().numpy(), coded_sps).all()
            sample_pre[:len(mfcc_warped), :hparams.num_coded_sps * (
                3 if hparams.add_deltas else 1)] = mfcc_warped[:, 0].detach()

            sample_post = gen_in.postprocess_sample(sample_pre)
            # Manually create samples without normalisation but with deltas.
            sample_pre = (sample_pre * gen_in.norm_params[1] +
                          gen_in.norm_params[0]).astype(np.float32)

            if np.isnan(sample_pre).any():
                raise ValueError(
                    "Detected nan values in output features for {}.".format(
                        id_name))
            # Save warped features.
            makedirs_safe(os.path.dirname(os.path.join(out_dir, id_name)))
            sample_pre.tofile(
                os.path.join(out_dir, id_name + WorldFeatLabelGen.ext_deltas))

            hparams.synth_dir = out_dir
            Synthesiser.run_world_synth({id_name: sample_post}, hparams)

    print("Process time for {} runs: {}".format(
        len(id_list) * idx, timedelta(seconds=t_benchmark)))
Example #15
def main():
    """Create samples with artificial alpha for each phoneme."""
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = sys.argv[1]
    hparams.model_name = "WarpingLayerTest.nn"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    alpha_range = 0.2
    num_phonemes = 70

    num_random_alphas = 7
    # num_random_alphas = 53

    # Randomly pick alphas for each phoneme.
    np.random.seed(42)
    # phonemes_to_alpha_tensor = ((np.random.choice(np.random.rand(num_random_alphas), num_phonemes) - 0.5) * 2 * alpha_range)
    phonemes_to_alpha_tensor = ((np.random.rand(num_phonemes) - 0.5) * 2 *
                                alpha_range)

    # hparams.num_questions = 505
    hparams.num_questions = 609
    # hparams.num_questions = 425

    hparams.out_dir = os.path.join("experiments", hparams.voice,
                                   "WORLD_artificially_warped")
    hparams.data_dir = os.path.realpath("database")
    hparams.model_name = "warping_layer_test"
    hparams.synth_dir = hparams.out_dir
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    print(
        "Create artificially warped MGCs for {} in {} for {} questions, {} random alphas, and an alpha range of {}."
        .format(hparams.voice, hparams.out_dir, hparams.num_questions,
                len(np.unique(phonemes_to_alpha_tensor)), alpha_range))

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    trainer = AcousticModelTrainer(
        os.path.join("experiments", hparams.voice, "WORLD"),
        os.path.join("experiments", hparams.voice, "questions"), "ignored",
        hparams.num_questions, hparams)

    hparams.num_speakers = 1
    speaker = "p276"
    num_synth_files = 5  # Number of files to synthesise to check warping manually.

    sp_mean = gen_in.norm_params[0][:hparams.num_coded_sps *
                                    (3 if hparams.add_deltas else 1)]
    sp_std_dev = gen_in.norm_params[1][:hparams.num_coded_sps *
                                       (3 if hparams.add_deltas else 1)]
    wl = WarpingLayer((hparams.num_coded_sps, ), (hparams.num_coded_sps, ),
                      hparams)
    wl.set_norm_params(sp_mean, sp_std_dev)

    def _question_to_phoneme_index(questions):
        """Helper function to convert questions to their current phoneme index."""
        if questions.shape[-1] == 505:  # German question set.
            indices = np.arange(86, 347, 5, dtype=int)  # np.int was removed in NumPy >= 1.24.
        elif questions.shape[-1] == 425:  # English radio question set.
            indices = np.arange(58, 107, dtype=int)
        elif questions.shape[-1] == 609:  # English unilex question set.
            indices = np.arange(92, 162, dtype=int)
        else:
            raise NotImplementedError(
                "Unknown question set with {} questions.".format(
                    questions.shape[-1]))
        return QuestionLabelGen.questions_to_phoneme_indices(
            questions, indices)

    # with open(os.path.join(hparams.data_dir, "file_id_list_{}_train.txt".format(hparams.voice))) as f:
    with open(
            os.path.join(hparams.data_dir, "file_id_list_{}_adapt.txt".format(
                hparams.voice))) as f:
        id_list = f.readlines()
    id_list[:] = [s.strip(' \t\n\r') for s in id_list
                  if speaker in s]  # Strip whitespace in-place and keep only the target speaker.

    out_dir = hparams.out_dir
    makedirs_safe(out_dir)
    makedirs_safe(os.path.join(out_dir,
                               "cmp_mgc" + str(hparams.num_coded_sps)))
    t_benchmark = 0
    org_to_warped_mcd = 0.0
    for idx, id_name in enumerate(id_list):

        sample = WorldFeatLabelGen.load_sample(
            id_name,
            os.path.join("experiments", hparams.voice, "WORLD"),
            add_deltas=True,
            num_coded_sps=hparams.num_coded_sps)
        sample_pre = gen_in.preprocess_sample(sample)
        coded_sps = sample_pre[:, :hparams.num_coded_sps *
                               (3 if hparams.add_deltas else 1)]

        questions = QuestionLabelGen.load_sample(
            id_name,
            os.path.join("experiments", hparams.voice, "questions"),
            num_questions=hparams.num_questions)
        questions = questions[:len(coded_sps)]
        phoneme_indices = _question_to_phoneme_index(questions)
        alpha_vec = phonemes_to_alpha_tensor[phoneme_indices %
                                             len(phonemes_to_alpha_tensor),
                                             None]

        coded_sps = coded_sps[:len(alpha_vec), None,
                              ...]  # Create a batch dimension.
        alpha_vec = alpha_vec[:, None,
                              None]  # Create a batch and feature dimension.

        t_start = timer()
        mfcc_warped, (_, nn_alpha) = wl(torch.from_numpy(coded_sps),
                                        None, (len(coded_sps), ),
                                        (len(coded_sps), ),
                                        alphas=torch.from_numpy(alpha_vec))
        t_benchmark += timer() - t_start
        sample_pre[:len(mfcc_warped), :hparams.num_coded_sps *
                   (3 if hparams.add_deltas else 1)] = mfcc_warped[:,
                                                                   0].detach()

        sample_post = gen_in.postprocess_sample(sample_pre)
        # Manually create samples without normalisation but with deltas.
        sample_pre = (sample_pre * gen_in.norm_params[1] +
                      gen_in.norm_params[0]).astype(np.float32)

        if np.isnan(sample_pre).any():
            raise ValueError(
                "Detected nan values in output features for {}.".format(
                    id_name))

        # Compute error between warped version and original one.
        org_to_warped_mcd += metrics.melcd(
            sample[:, 0:hparams.num_coded_sps],
            sample_pre[:, 0:hparams.num_coded_sps])

        # Save warped features.
        sample_pre.tofile(
            os.path.join(
                out_dir, "cmp_mgc" + str(hparams.num_coded_sps),
                os.path.basename(id_name + WorldFeatLabelGen.ext_deltas)))

        hparams.synth_dir = out_dir
        if idx < num_synth_files:  # Only synthesize a few of samples.
            trainer.run_world_synth({id_name: sample_post}, hparams)

    print("Process time for {} warpings: {}. MCD caused by warping: {:.2f}".
          format(len(id_list), timedelta(seconds=t_benchmark),
                 org_to_warped_mcd / len(id_list)))

    # Copy normalisation files which are necessary for training.
    for feature in ["_bap", "_lf0", "_mgc{}".format(hparams.num_coded_sps)]:
        shutil.copyfile(
            os.path.join(
                gen_in.dir_labels, gen_in.dir_deltas,
                MeanCovarianceExtractor.file_name_appendix + feature + ".bin"),
            os.path.join(
                out_dir, "cmp_mgc" + str(hparams.num_coded_sps),
                MeanCovarianceExtractor.file_name_appendix + feature + ".bin"))
Example #16
    def run_DM_AM(hparams, input_strings):
        """
        A function for TTS with a pre-trained duration and acoustic model.

        :param hparams:        Hyper-parameter container. The following parameters are used:
                               front_end:                    Full path to the makeLabels.sh script in scripts/tts_frontend; depends on the language.
                               festival_dir:                 Full path to the directory containing the festival bin/ folder.
                               front_end_accent (optional):  Accent given to the front_end, used in tts_frontend.
                               duration_labels_dir:          Full path to the folder containing the normalisation parameters used to train the duration model.
                               file_symbol_dict:             A file containing all used phonemes (also used to train the duration model, usually mono_phone.list).
                               duration_model:               Full path to the pre-trained duration model.
                               num_phoneme_states:           Number of states per phoneme; the duration model predicts a duration for each state.
                               question_file:                Full path to the question file used to train the acoustic model.
                               question_labels_norm_file:    Full path to the normalisation file of the questions used to train the acoustic model.
                               num_questions:                Number of questions, i.e. the input dimension of the acoustic model.
                               acoustic_model:               Full path to the pre-trained acoustic model.
        :param input_strings:  List of utterance strings to synthesise.
        :return:               0 on success.
        """
        # Create a temporary directory to store all files.
        with tempfile.TemporaryDirectory() as tmp_dir_name:
            # tmp_dir_name = os.path.realpath("TMP")
            # makedirs_safe(tmp_dir_name)
            hparams.out_dir = tmp_dir_name
            print("Created temporary directory", tmp_dir_name)
            id_list = ["synth" + str(idx) for idx in range(len(input_strings))]

            # Write the text to synthesise into a single synth.txt file with ids.
            utts_file = os.path.join(tmp_dir_name, "synth.txt")
            with open(utts_file, "w") as text_file:
                for idx, text in enumerate(input_strings):
                    text_file.write("synth{}\t{}\n".format(
                        idx, text))  # TODO: Remove parenthesis etc.

            # Call the front end on the synth.txt file.
            front_end_arguments = [
                hparams.front_end, hparams.festival_dir, utts_file
            ]
            if getattr(hparams, "front_end_accent", None) is not None:
                front_end_arguments.append(hparams.front_end_accent)
            front_end_arguments.append(tmp_dir_name)
            subprocess.check_call(front_end_arguments)

            # Remove durations from mono labels.
            dir_mono_no_align = os.path.join(tmp_dir_name, "mono_no_align")
            dir_mono = os.path.join(tmp_dir_name, "labels", "mono")

            if os.path.isdir(dir_mono_no_align):
                shutil.rmtree(dir_mono_no_align)
            os.rename(dir_mono, dir_mono_no_align)
            for id_name in id_list:
                with open(os.path.join(dir_mono_no_align, id_name + ".lab"),
                          "r") as f:
                    old = f.read()
                    monophones = old.split()[2::3]
                with open(os.path.join(dir_mono_no_align, id_name + ".lab"),
                          "w") as f:
                    f.write("\n".join(monophones))

            # Run duration model.
            hparams.batch_size_test = len(input_strings)
            hparams.test_set_perc = 0.0
            hparams.val_set_perc = 0.0
            hparams.phoneme_label_type = "mono_no_align"
            hparams.output_norm_params_file_prefix = getattr(
                hparams, "duration_norm_file_name", None)
            duration_model_trainer = DurationModelTrainer(
                os.path.join(tmp_dir_name, "mono_no_align"),
                hparams.duration_labels_dir, id_list,
                hparams.file_symbol_dict, hparams)
            assert hparams.duration_model is not None, "Path to duration model in hparams.duration_model is needed."
            hparams.model_path = hparams.duration_model
            hparams.model_name = os.path.basename(hparams.duration_model)

            # Predict durations. Durations are already converted to multiples of hparams.min_phoneme_length.
            hparams.load_from_checkpoint = True
            duration_model_trainer.init(hparams)
            _, output_dict_post = duration_model_trainer.forward(
                hparams, id_list)
            hparams.output_norm_params_file_prefix = None  # Reset again.

            # Write duration to full labels.
            dir_full = os.path.join(tmp_dir_name, "labels", "full")
            dir_label_state_align = os.path.join(tmp_dir_name, "labels",
                                                 "label_state_align")
            makedirs_safe(dir_label_state_align)
            for id_name in id_list:
                with open(os.path.join(dir_full, id_name + ".lab"), "r") as f:
                    full = f.read().split()[2::3]
                with open(
                        os.path.join(dir_label_state_align, id_name + ".lab"),
                        "w") as f:
                    current_time = 0
                    timings = output_dict_post[id_name]
                    for idx, monophone in enumerate(full):
                        for state in range(hparams.num_phoneme_states):
                            next_time = current_time + int(timings[idx, state])
                            f.write("{}\t{}\t{}[{}]\n".format(
                                current_time, next_time, monophone, state + 2))
                            current_time = next_time

            # Generate questions from HTK full labels.
            QuestionLabelGen.gen_data(dir_label_state_align,
                                      hparams.question_file,
                                      dir_out=tmp_dir_name,
                                      file_id_list="synth",
                                      id_list=id_list,
                                      return_dict=False)

            # Run acoustic model and synthesise.
            # Copy the normalisation parameters into the same directory.
            shutil.copy2(hparams.question_labels_norm_file,
                         os.path.join(tmp_dir_name, "min-max.bin"))
            acoustic_model_trainer = AcousticModelTrainer(
                hparams.world_features_dir, tmp_dir_name, id_list,
                hparams.num_questions, hparams)
            assert hparams.acoustic_model is not None, "Path to acoustic model in hparams.acoustic_model is needed."
            hparams.model_path = hparams.acoustic_model
            hparams.model_name = os.path.basename(hparams.acoustic_model)
            hparams.load_from_checkpoint = True
            acoustic_model_trainer.init(hparams)
            hparams.model_name = ""  # No suffix in synthesised files.
            _, output_dict_post = acoustic_model_trainer.synth(
                hparams, id_list)

            logging.info("Synthesized files are in {}.".format(
                hparams.synth_dir))

        return 0
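A hedged invocation sketch for run_DM_AM; every path and value below is a placeholder chosen to match the docstring, and the directory layout is an assumption, not taken from the function itself:

    hparams = create_hparams()  # E.g. the combined TTS hparams from Example #2.
    hparams.front_end = "scripts/tts_frontend/makeLabels.sh"
    hparams.festival_dir = "/opt/festival"
    hparams.duration_labels_dir = "experiments/English/dur"
    hparams.file_symbol_dict = "experiments/English/mono_phone.list"
    hparams.duration_model = "experiments/English/DM.nn"
    hparams.num_phoneme_states = 5
    hparams.question_file = "experiments/English/questions.hed"
    hparams.question_labels_norm_file = "experiments/English/min-max.bin"
    hparams.num_questions = 425
    hparams.world_features_dir = "experiments/English/WORLD"
    hparams.acoustic_model = "experiments/English/AM.nn"
    run_DM_AM(hparams, ["Hello world.", "This is a test."])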
Example #17
    hparams.complex_poles = False
    hparams.thetas = thetas
    hparams.hparams_flat = copy.deepcopy(hparams)
    hparams.hparams_atom = copy.deepcopy(hparams)
    hparams.hparams_flat.hparams_atom = hparams.hparams_atom
    hparams.flat_model_path = "neural_filters_model_in409_out2.nn"
    hparams.atom_model_path = "test_model_in409_out7.nn"
    hparams.hparams_atom.model_path = hparams.atom_model_path
    hparams.hparams_flat.model_path = hparams.flat_model_path
    hparams.hparams_flat.atom_model_path = hparams.atom_model_path
    model_handler = ModelHandlerPyTorch()
    epochs = model_handler.load_checkpoint(hparams.model_name, hparams)
    model_handler.save_checkpoint(os.path.realpath(hparams.model_name), epochs)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    hparams = AcousticModelTrainer.create_hparams()
    hparams.model_name = "test_model_in409_out67.nn"
    model_handler = ModelHandlerPyTorch()
    epochs = model_handler.load_checkpoint(hparams.model_name, hparams)
    model_handler.save_checkpoint(os.path.realpath(hparams.model_name), epochs)

    from idiaptts.src.model_trainers.WaveNetVocoderTrainer import WaveNetVocoderTrainer
    hparams = WaveNetVocoderTrainer.create_hparams()
    hparams.model_name = "r9y9_wavenet_in23_out128.nn"
    hparams.num_coded_sps = 20
    hparams.input_type = "mulaw-quantize"
    hparams.quantize_channels = 128
    hparams.mu = 127
    hparams.out_channels = hparams.quantize_channels
    hparams.cin_channels = hparams.num_coded_sps + 3
    hparams.upsample_conditional_features = True