예제 #1
0
    def load_model(self, model_path, model_name, model_config, use_cuda):
        model_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.use_phonemes = config.use_phonemes
        self.ap = AudioProcessor(**config.audio)

        if self.use_phonemes:
            self.input_size = len(phonemes)
            self.input_adapter = lambda sen: phoneme_to_sequence(
                sen, [self.config.text_cleaner], self.config.phoneme_language)
        else:
            self.input_size = len(symbols)
            self.input_adapter = lambda sen: text_to_sequence(
                sen, [self.config.text_cleaner])

        self.model = Tacotron(self.input_size, config.embedding_size,
                              self.ap.num_freq, self.ap.num_mels, config.r)
        # load model state
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        # load the model
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()
예제 #2
0
파일: eval.py 프로젝트: geneing/TTS
 def load_model(self, model_path, model_config, wavernn_path, use_cuda):
     
     self.model_file = model_path
     print(" > Loading model ...")
     print(" | > model config: ", model_config)
     print(" | > model file: ", self.model_file)
     config = load_config(model_config)
     self.config = config
     self.use_cuda = use_cuda
     self.use_phonemes = config.use_phonemes
     self.ap = AudioProcessor(**config.audio)
     
     if self.use_phonemes:
         self.input_size = len(phonemes)
         self.input_adapter = lambda sen: phoneme_to_sequence(sen, [self.config.text_cleaner], self.config.phoneme_language)
     else:
         self.input_size = len(symbols)
         self.input_adapter = lambda sen: text_to_sequence(sen, [self.config.text_cleaner])
     
     self.model = Tacotron(self.input_size, config.embedding_size, self.ap.num_freq, self.ap.num_mels, config.r, attn_windowing=True)
     self.model.decoder.max_decoder_steps = 8000
     # load model state
     if use_cuda:
         cp = torch.load(self.model_file)
     else:
         cp = torch.load(self.model_file, map_location=lambda storage, loc: storage)
     # load the model
     self.model.load_state_dict(cp['model'])
     if use_cuda:
         self.model.cuda()
     self.model.eval()
     self.vocoder=WaveRNNVocoder.Vocoder()
     self.vocoder.loadWeights(wavernn_path)
     self.firwin = signal.firwin(1025, [65, 7600], pass_zero=False, fs=16000)
예제 #3
0
 def load_tts(self, model_path, model_file, model_config, use_cuda):
     tts_config = os.path.join(model_path, model_config)
     self.model_file = os.path.join(model_path, model_file)
     print(" > Loading TTS model ...")
     print(" | > model config: ", tts_config)
     print(" | > model file: ", model_file)
     self.tts_config = load_config(tts_config)
     self.use_phonemes = self.tts_config.use_phonemes
     self.ap = AudioProcessor(**self.tts_config.audio)
     if self.use_phonemes:
         self.input_size = len(phonemes)
         self.input_adapter = lambda sen: phoneme_to_sequence(sen, [self.tts_config.text_cleaner], self.tts_config.phoneme_language, self.tts_config.enable_eos_bos_chars)
     else:
         self.input_size = len(symbols)
         self.input_adapter = lambda sen: text_to_sequence(sen, [self.tts_config.text_cleaner])
     self.tts_model = setup_model(self.input_size, self.tts_config)
     # load model state
     if use_cuda:
         cp = torch.load(self.model_file)
     else:
         cp = torch.load(self.model_file, map_location=lambda storage, loc: storage)
     # load the model
     self.tts_model.load_state_dict(cp['model'])
     if use_cuda:
         self.tts_model.cuda()
     self.tts_model.eval()
     self.tts_model.decoder.max_decoder_steps = 3000
예제 #4
0
    def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda):
        sys.path.append(lib_path) # set this if TTS is not installed globally
        from WaveRNN.models.wavernn import Model
        wavernn_config = os.path.join(model_path, model_config)
        model_file = os.path.join(model_path, model_file)
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", wavernn_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(wavernn_config)
        self.wavernn = Model(
                rnn_dims=512,
                fc_dims=512,
                mode=self.wavernn_config.mode,
                pad=2,
                upsample_factors=self.wavernn_config.upsample_factors,  # set this depending on dataset
                feat_dims=80,
                compute_dims=128,
                res_out_dims=128,
                res_blocks=10,
                hop_length=self.ap.hop_length,
                sample_rate=self.ap.sample_rate,
            ).cuda()

        check = torch.load(model_file)
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()
예제 #5
0
    def load_wavernn(self, lib_path, model_path, model_file, model_config,
                     use_cuda):
        # TODO: set a function in wavernn code base for model setup and call it here.
        sys.path.append(lib_path)  # set this if TTS is not installed globally
        from WaveRNN.models.wavernn import Model
        wavernn_config = os.path.join(model_path, model_config)
        model_file = os.path.join(model_path, model_file)
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", wavernn_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(wavernn_config)
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        ).cuda()

        check = torch.load(model_file)
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()
예제 #6
0
 def load_tts(self, model_path, model_file, model_config, use_cuda):
     tts_config = os.path.join(model_path, model_config)
     self.model_file = os.path.join(model_path, model_file)
     print(" > Loading TTS model ...")
     print(" | > model config: ", tts_config)
     print(" | > model file: ", model_file)
     self.tts_config = load_config(tts_config)
     self.use_phonemes = self.tts_config.use_phonemes
     self.ap = AudioProcessor(**self.tts_config.audio)
     if self.use_phonemes:
         self.input_size = len(phonemes)
     else:
         self.input_size = len(symbols)
     # load speakers
     if self.config.tts_speakers is not None:
         self.tts_speakers = load_speaker_mapping(os.path.join(model_path, self.config.tts_speakers))
         num_speakers = len(self.tts_speakers)
     else:
         num_speakers = 0
     self.tts_model = setup_model(self.input_size, num_speakers=num_speakers, c=self.tts_config) 
     # load model state
     cp = torch.load(self.model_file)
     # load the model
     self.tts_model.load_state_dict(cp['model'])
     if use_cuda:
         self.tts_model.cuda()
     self.tts_model.eval()
     self.tts_model.decoder.max_decoder_steps = 3000
     if 'r' in cp and self.tts_config.model in ["Tacotron", "TacotronGST"]:
         self.tts_model.decoder.set_r(cp['r'])
예제 #7
0
def main():
    """
    Call train.py as a new process and pass command arguments
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--restore_path',
                        type=str,
                        help='Folder path to checkpoints',
                        default='')
    parser.add_argument(
        '--config_path',
        type=str,
        help='path to config file for training',
    )
    parser.add_argument('--data_path',
                        type=str,
                        help='dataset path.',
                        default='')

    args = parser.parse_args()

    CONFIG = load_config(args.config_path)
    OUT_PATH = create_experiment_folder(CONFIG.output_path, CONFIG.run_name,
                                        True)
    stdout_path = os.path.join(OUT_PATH, "process_stdout/")

    num_gpus = torch.cuda.device_count()
    group_id = time.strftime("%Y_%m_%d-%H%M%S")

    # set arguments for train.py
    command = ['train.py']
    command.append('--restore_path={}'.format(args.restore_path))
    command.append('--config_path={}'.format(args.config_path))
    command.append('--group_id=group_{}'.format(group_id))
    command.append('--data_path={}'.format(args.data_path))
    command.append('--output_path={}'.format(OUT_PATH))
    command.append('')

    if not os.path.isdir(stdout_path):
        os.makedirs(stdout_path)
        os.chmod(stdout_path, 0o775)

    # run processes
    processes = []
    for i in range(num_gpus):
        my_env = os.environ.copy()
        my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
        command[6] = '--rank={}'.format(i)
        stdout = None if i == 0 else open(
            os.path.join(stdout_path, "process_{}.log".format(i)), "w")
        p = subprocess.Popen(['python3'] + command, stdout=stdout, env=my_env)
        processes.append(p)
        print(command)

    for p in processes:
        p.wait()
예제 #8
0
def main(args):
    """
    Call train.py as a new process and pass command arguments
    """
    CONFIG = load_config(args.config_path)
    if args.output_path == "":
        OUT_PATH = os.path.join(_, CONFIG.output_path)
    else:
        OUT_PATH = args.output_path
    OUT_PATH = create_experiment_folder(OUT_PATH, CONFIG.model_name)
    stdout_path = os.path.join(OUT_PATH, "process_stdout/")

    num_gpus = torch.cuda.device_count()
    group_id = time.strftime("%Y_%m_%d-%H%M%S")

    if args.lr_find:
        command = ['find_lr.py']
        command.append('--restore_path={}'.format(args.restore_path))
        command.append('--config_path={}'.format(args.config_path))
        command.append('--group_id=group_{}'.format(group_id))
        command.append('--data_path={}'.format(args.data_path))
        command.append('--output_path={}'.format(OUT_PATH))
        command.append('--init_lr={}'.format(args.init_lr))
        command.append('--end_lr={}'.format(args.end_lr))
        command.append('')
    else:
        # set arguments for train.py
        command = ['train.py']
        command.append('--restore_path={}'.format(args.restore_path))
        command.append('--config_path={}'.format(args.config_path))
        command.append('--group_id=group_{}'.format(group_id))
        command.append('--data_path={}'.format(args.data_path))
        command.append('--output_path={}'.format(OUT_PATH))
        command.append('')

    if not os.path.isdir(stdout_path):
        os.makedirs(stdout_path)
        os.chmod(stdout_path, 0o775)

    # run processes
    processes = []
    for i in range(num_gpus):
        my_env = os.environ.copy()
        my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
        command[6] = '--rank={}'.format(i)
        stdout = None if i == 0 else open(
            os.path.join(stdout_path, "process_{}.log".format(i)), "w")
        p = subprocess.Popen(['python3'.format(i)] + command,
                             stdout=stdout,
                             env=my_env)
        processes.append(p)
        print(command)

    for p in processes:
        p.wait()
예제 #9
0
    def load_model(self, model_path, model_name, model_config, use_cuda):

        #build the config's path
        model_config = os.path.join(model_path, model_config)

        #build the model's path
        model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > Model config path: ", model_config)
        print(" | > Model file path: ", model_file)

        config = load_config(model_config)
        self.use_cuda = use_cuda
        self.use_phonemes = config.use_phonemes
        self.ap = AudioProcessor(**config.audio)

        if self.use_phonemes:
            self.input_size = len(phonemes)
            self.input_adapter = lambda sen: phoneme_to_sequence(
                sen, [config.text_cleaner], config.phoneme_language)
        else:
            self.input_size = len(symbols)
            self.input_adapter = lambda sen: text_to_sequence(
                sen, [config.text_cleaner])

        self.model = Tacotron(num_chars=config['num_chars'],
                              embedding_dim=config['embedding_size'],
                              linear_dim=self.ap.num_freq,
                              mel_dim=self.ap.num_mels,
                              r=config['r'])

        #load model state
        if use_cuda:
            cp = torch.load(model_file)
        else:
            cp = torch.load(model_file,
                            map_location=lambda storage, loc: storage)

        #load the model
        self.model.load_state_dict(cp['model'])

        #if cuda is enabled & available move tensors to GPU
        if use_cuda:
            self.model.cuda()

        #disables normalization techniques present in code
        self.model.eval()
 def load_model(self, model_path, model_name, model_config, use_cuda):
     model_config = os.path.join(model_path, model_config)
     self.model_file = os.path.join(model_path, model_name)
     print(" > Loading model ...")
     print(" | > model config: ", model_config)
     print(" | > model file: ", self.model_file)
     config = load_config(model_config)
     self.config = config
     self.use_cuda = use_cuda
     self.ap = AudioProcessor(**config.audio)
     self.model = Tacotron(61, config.embedding_size, self.ap.num_freq,
                           self.ap.num_mels, config.r)
     # load model state
     if use_cuda:
         cp = torch.load(self.model_file)
     else:
         cp = torch.load(self.model_file,
                         map_location=lambda storage, loc: storage)
     # load the model
     self.model.load_state_dict(cp['model'])
     if use_cuda:
         self.model.cuda()
     self.model.eval()
예제 #11
0
 def test_in_out(self):
     self._create_random_model()
     config = load_config(os.path.join(get_tests_input_path(), 'server_config.json'))
     synthesizer = Synthesizer(config)
     synthesizer.tts("Better this test works!!")
예제 #12
0
import os
import unittest

from tests import get_tests_path, get_tests_input_path, get_tests_output_path
from utils.audio import AudioProcessor
from utils.generic_utils import load_config

TESTS_PATH = get_tests_path()
OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")

os.makedirs(OUT_PATH, exist_ok=True)
conf = load_config(os.path.join(TESTS_PATH, 'test_config.json'))


class TestAudio(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super(TestAudio, self).__init__(*args, **kwargs)
        self.ap = AudioProcessor(**conf.audio)

    def test_audio_synthesis(self):
        """ 1. load wav
            2. set normalization parameters
            3. extract mel-spec
            4. invert to wav and save the output
        """
        print(" > Sanity check for the process wav -> mel -> wav")

        def _test(max_norm, signal_norm, symmetric_norm, clip_norm):
            self.ap.max_norm = max_norm
            self.ap.signal_norm = signal_norm
        '--librispeech',
        type=str,
        required=False,
        default=False,
        help="Librispeech format, if true load with librispeech format")
    args = parser.parse_args()

    os.makedirs(args.out_dir, exist_ok=True)
    if args.train_data_csv:
        os.makedirs(os.path.join(args.out_dir, 'train'), exist_ok=True)
    if args.test_data_csv:
        os.makedirs(os.path.join(args.out_dir, 'test'), exist_ok=True)

    cpu_num = cpu_count()  # num threads = num cpu cores

    config = load_config(args.config)
    ap = AudioProcessor(config.audio)

    sample_rate = config.audio[config.audio['backend']]['sample_rate']
    audio_len = config.audio['audio_len']
    form = config.dataset['format']
    output_dir_train = os.path.join(args.out_dir, 'train')
    output_dir_test = os.path.join(args.out_dir, 'test')

    dataset_root_dir = args.dataset_root_dir
    train_data_csv = None
    test_data_csv = None

    noise_files = open(args.noise_csv).readlines()

    if args.train_data_csv:
예제 #14
0
        help='Path to input text file.',
    )
    parser.add_argument(
        '--output',
        type=str,
        help='Path to save final wav file.',
    )

    args = parser.parse_args()

    try:
        path = os.path.realpath(os.path.dirname(__file__))
    except NameError as e:
        path = './'

    C = load_config(os.path.join(path, 'pretrained_models/TTS/config.json'))
    C.forward_attn_mask = False
    C.windowing = True
    # load the audio processor
    ap = AudioProcessor(**C.audio)
    num_speakers = 0

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    model = setup_model(num_chars, num_speakers, C)
    cp = torch.load(os.path.join(path,
                                 'pretrained_models/TTS/best_model.pth.tar'),
                    map_location='cpu')
    model.load_state_dict(cp['model'], strict=False)
    model.r = cp['r']
    model.decoder.r = cp['r']
예제 #15
0
def run_test_all_seeds(args, cuda=True, debug=False, return_potential=False):
    runs_list = os.listdir(args.experiment_dir)
    runs_list.sort()
    num_runs = len(runs_list)

    votes = []
    wav_files = []
    targets = []
    # define loss function
    criterion = nn.BCELoss(reduction='sum')
    for run in runs_list:
        blockPrint()
        run_dir = os.path.join(args.experiment_dir, run)
        if os.path.isfile(run_dir):
            continue
        model_name = os.listdir(run_dir)[0]
        checkpoint_path = os.path.join(run_dir, model_name,
                                       'best_checkpoint.pt')
        config_path = os.path.join(run_dir, model_name, 'config.json')

        c = load_config(config_path)
        ap = AudioProcessor(**c.audio)

        c.dataset['test_csv'] = args.test_csv
        c.dataset['test_data_root_path'] = args.test_root_dir

        c.test_config['batch_size'] = args.batch_size
        c.test_config['num_workers'] = args.num_workers
        max_seq_len = c.dataset['max_seq_len']

        c.train_config['seed'] = 0

        testdataloader = test_dataloader(c, ap, max_seq_len=max_seq_len)

        # load model
        model = return_model(c)
        enablePrint()
        if checkpoint_path is not None:
            print("Loading checkpoint: %s" % checkpoint_path)
            try:
                checkpoint = torch.load(checkpoint_path, map_location='cpu')
                model.load_state_dict(checkpoint['model'])
                print("Model Sucessful Load !")
            except Exception as e:
                raise ValueError(
                    "You need pass a valid checkpoint, may be you need check your config.json because de the of this checkpoint cause the error: "
                    + e)
        blockPrint()
        # convert model from cuda
        if cuda:
            model = model.cuda()

        model.train(False)

        vote, targets, wav_path = test(criterion,
                                       ap,
                                       model,
                                       c,
                                       testdataloader,
                                       cuda=cuda,
                                       confusion_matrix=True,
                                       debug=debug,
                                       simples_vote=args.simples_vote)
        # print(vote)
        wav_files.append(wav_path)
        votes.append(vote)
        if len(wav_files):
            if wav_files[-1] != wav_files[0]:
                raise ValueError(
                    "Diferents files  or order for the test in diferrents seeds or folds"
                )

    # mean vote, and round is necessary if use composite vote
    preds = np.mean(np.array(votes), axis=0)
    # print(preds)
    if not return_potential:
        preds = preds.round()

    file_names = wav_files[0]

    if debug and not return_potential:
        enablePrint()
        targets = np.array(targets)
        preds = np.array(preds)
        names = np.array(file_names)
        idxs = np.nonzero(targets == c.dataset['control_class'])
        control_target = targets[idxs]
        control_preds = preds[idxs]
        names_control = names[idxs]

        idxs = np.nonzero(targets == c.dataset['patient_class'])

        patient_target = targets[idxs]
        patient_preds = preds[idxs]
        names_patient = names[idxs]

        if debug:
            print('+' * 40)
            print("Control Files Classified incorrectly:")
            incorrect_ids = np.nonzero(
                control_preds != c.dataset['control_class'])
            inc_names = names_control[incorrect_ids]
            print("Num. Files:", len(inc_names))
            print(inc_names)
            print('+' * 40)
            print('-' * 40)
            print("Patient Files Classified incorrectly:")
            incorrect_ids = np.nonzero(
                patient_preds != c.dataset['patient_class'])
            inc_names = names_patient[incorrect_ids]
            print("Num. Files:", len(inc_names))
            print(inc_names)
            print('-' * 40)

        acc_control = (control_preds == control_target).mean()
        acc_patient = (patient_preds == patient_target).mean()
        acc_balanced = (acc_control + acc_patient) / 2

        f1 = f1_score(targets.tolist(), preds.tolist())
        uar = recall_score(targets.tolist(), preds.tolist(), average='macro')
        print("======== Confusion Matrix ==========")
        y_target = pd.Series(targets, name='Target')
        y_pred = pd.Series(preds, name='Predicted')
        df_confusion = pd.crosstab(y_target,
                                   y_pred,
                                   rownames=['Target'],
                                   colnames=['Predicted'],
                                   margins=True)
        print(df_confusion)

        print("Test\n ", "Acurracy Control: ", acc_control,
              "Acurracy Patient: ", acc_patient, "Acurracy Balanced",
              acc_balanced)
        print("F1:", f1, "UAR:", uar)

    if return_potential:
        return preds, file_names
    else:
        df = pd.DataFrame({
            'filename': file_names,
            'prediction': preds.astype(int)
        })
        df['prediction'] = df['prediction'].replace(
            int(c.dataset['control_class']), 'negative',
            regex=True).replace(int(c.dataset['patient_class']),
                                'positive',
                                regex=True)
        if args.output_csv:
            out_csv_path = args.output_csv
        else:
            out_csv_path = os.path.join(
                args.experiment_dir, os.path.basename(c.dataset['test_csv']))

        df.to_csv(out_csv_path, index=False)
예제 #16
0
파일: train.py 프로젝트: marzus555/WaveRNN
    parser.add_argument(
        "--output_path", type=str, help="path for training outputs.", default=""
    )
    # DISTRUBUTED
    parser.add_argument(
        "--rank",
        type=int,
        default=0,
        help="DISTRIBUTED: process rank for distributed training.",
    )
    parser.add_argument(
        "--group_id", type=str, default="", help="DISTRIBUTED: process group id."
    )

    args = parser.parse_args()
    CONFIG = load_config(args.config_path)

    if args.data_path != "":
        CONFIG.data_path = args.data_path
    DATA_PATH = CONFIG.data_path

    # DISTRUBUTED
    if num_gpus > 1:
        init_distributed(
            args.rank,
            num_gpus,
            args.group_id,
            CONFIG.distributed["backend"],
            CONFIG.distributed["url"],
        )
예제 #17
0
파일: train.py 프로젝트: JackInTaiwan/TTS
    parser.add_argument('--reset_lr', action="store_true", help='reset lr.')

    # DISTRUBUTED
    parser.add_argument(
        '--rank',
        type=int,
        default=0,
        help='DISTRIBUTED: process rank for distributed training.')
    parser.add_argument('--group_id',
                        type=str,
                        default="",
                        help='DISTRIBUTED: process group id.')
    args = parser.parse_args()

    # setup output paths and read configs
    c = load_config(args.config_path)
    _ = os.path.dirname(os.path.realpath(__file__))
    if args.data_path != '':
        c.data_path = args.data_path

    if args.output_path == '':
        OUT_PATH = os.path.join(_, c.output_path)
    else:
        OUT_PATH = args.output_path

    if args.group_id == '' and args.output_folder == '':
        OUT_PATH = create_experiment_folder(OUT_PATH, c.run_name, args.debug)
    else:
        OUT_PATH = os.path.join(OUT_PATH, args.output_folder)

    AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
예제 #18
0
    MODEL_PATH_TMP = ROOT_PATH + '/checkpoint_{}.pth.tar'

    if args.step is None:
        MODEL_PATH = ROOT_PATH + 'best_model.pth.tar'
    else:
        MODEL_PATH = MODEL_PATH_TMP.format(args.step)

    print(MODEL_PATH)
    CONFIG_PATH = ROOT_PATH + '/config.json'
    OUT_FOLDER = ROOT_PATH + '/test/'
    duration_folder = os.path.join(OUT_FOLDER, 'durations')
    plot_folder = os.path.join(OUT_FOLDER, 'plot')
    os.makedirs(duration_folder, exist_ok=True)
    os.makedirs(plot_folder, exist_ok=True)

    c = load_config(CONFIG_PATH)
    ap = AudioProcessor(**c.audio)
    use_cuda = True
    
    data_loader = setup_loader(c, is_val=False)
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)
    
    model = Tacotron(
        num_chars=num_chars,
        embedding_dim=c.embedding_size,
        linear_dim=ap.num_freq,
        mel_dim=ap.num_mels,
        r=c.r,
        memory_size=c.memory_size)

    checkpoint = torch.load(MODEL_PATH)
예제 #19
0
 def __init__(self, *args, **kwargs):
     super(TestTTSDatasetCached, self).__init__(*args, **kwargs)
     self.max_loader_iter = 4
     self.c = load_config(os.path.join(c.data_path_cache, 'config.json'))
     self.ap = AudioProcessor(**self.c.audio)
예제 #20
0
    parser.add_argument("--val_split",
                        type=int,
                        default=0,
                        help="Number of instances for validation.")
    parser.add_argument("--meta_file",
                        type=str,
                        help="Meta data file to be used for the dataset.")
    parser.add_argument("--process_audio",
                        type=bool,
                        default=False,
                        help="Preprocess audio files.")
    args = parser.parse_args()

    DATA_PATH = args.data_path
    CACHE_PATH = args.cache_path
    CONFIG = load_config(args.config)

    # load the right preprocessor
    preprocessor = importlib.import_module('datasets.preprocess')
    preprocessor = getattr(preprocessor, args.dataset.lower())
    items = preprocessor(args.data_path, args.meta_file)

    print(" > Input path: ", DATA_PATH)
    print(" > Cache path: ", CACHE_PATH)

    ap = AudioProcessor(**CONFIG.audio)

    def extract_mel(item):
        """ Compute spectrograms, length information """
        text = item[0]
        file_path = item[1]
예제 #21
0
                        type=str,
                        default='',
                        help="data path to overwrite config.json.")
    parser.add_argument("--out_path",
                        type=str,
                        default='',
                        help="destination to write files.")
    parser.add_argument("--ignore_errors",
                        type=bool,
                        default=False,
                        help="ignore bad files.")

    args = parser.parse_args()

    config_path = args.config_path
    CONFIG = load_config(config_path)

    if args.data_path != '':
        CONFIG.data_path = args.data_path

    if type(CONFIG.mode) is int:
        CONFIG.audio['bits'] = CONFIG.mode
    ap = AudioProcessor(**CONFIG.audio)

    SEG_PATH = CONFIG.data_path
    # OUT_PATH = os.path.join(args.out_path, CONFIG.run_name, "data/")
    OUT_PATH = args.out_path
    QUANT_PATH = os.path.join(OUT_PATH, "quant/")
    MEL_PATH = os.path.join(OUT_PATH, "mel/")
    os.makedirs(OUT_PATH, exist_ok=True)
    os.makedirs(QUANT_PATH, exist_ok=True)
예제 #22
0
import os
import glob
import tqdm
import torch
import random
import librosa
import argparse
import numpy as np
from multiprocessing import Pool, cpu_count

from utils.audio_processor import WrapperAudioProcessor as AudioProcessor
from utils.generic_utils import mix_wavfiles
from utils.generic_utils import load_config
import pandas as pd

import librosa

config = load_config('config.json')
ap = AudioProcessor(config.audio)

data_path = '../test-my-data-prepo/train/'
files = os.listdir(data_path)
for file_name in files:
    if '.pt' in file_name:
        spec = ap.inv_spectrogram(
            torch.load(os.path.join(data_path,
                                    file_name)).cpu().detach().numpy())
        ap.save_wav(spec, os.path.join(data_path, file_name + '.wav'))
예제 #23
0
 def _create_random_model(self):
     config = load_config(os.path.join(get_tests_output_path(), 'dummy_model_config.json'))
     num_chars = len(phonemes) if config.use_phonemes else len(symbols)
     model = setup_model(num_chars, 0, config)
     output_path = os.path.join(get_tests_output_path())
     save_checkpoint(model, None, None, None, output_path, 10, 10)
예제 #24
0
#!flask/bin/python
import argparse
import torch
from synthesizer import Synthesizer
from utils.generic_utils import load_config
from flask import Flask, Response, request, render_template, send_file

parser = argparse.ArgumentParser()
parser.add_argument('-c',
                    '--config_path',
                    type=str,
                    help='path to config file for training')
args = parser.parse_args()

config = load_config('server/config.json')
app = Flask(__name__)
synthesizer = Synthesizer()
synthesizer.load_model(config.model_path, config.model_name,
                       config.model_config, config.use_cuda)


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/api/tts', methods=['GET'])
def tts():
    text = request.args.get('text')
    synthesizer.ap.frame_shift_ms = int(request.args.get('shift'))
    synthesizer.ap.griffin_lim_iters = int(request.args.get('iter'))
예제 #25
0
import torch
import unittest
import numpy as np

from torch import optim
from torch import nn
from utils.generic_utils import load_config
from layers.losses import L1LossMasked
from models.tacotron import Tacotron

torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

file_path = os.path.dirname(os.path.realpath(__file__))
c = load_config(os.path.join(file_path, 'test_config.json'))


class TacotronTrainTest(unittest.TestCase):
    def test_train_step(self):
        input = torch.randint(0, 24, (8, 128)).long().to(device)
        input_lengths = torch.randint(100, 129, (8, )).long().to(device)
        input_lengths[-1] = 128
        mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
        linear_spec = torch.rand(8, 30, c.audio['num_freq']).to(device)
        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
        stop_targets = torch.zeros(8, 30, 1).float().to(device)

        for idx in mel_lengths:
            stop_targets[:, int(idx.item()):, 0] = 1.0
예제 #26
0
파일: synthesize.py 프로젝트: geneing/TTS
                        type=str,
                        help="JSON file for multi-speaker model.",
                        default="")
    parser.add_argument(
        '--text_gst_prediction',
        type=bool,
        default=True,
        help='Predict style from the text itself for more dynamic speech.')

    args = parser.parse_args()

    if args.vocoder_path != "":
        from WaveRNN.models.wavernn import Model as VocoderModel

    # load the config
    C = load_config(args.config_path)
    C.forward_attn_mask = True

    # load the audio processor
    ap = AudioProcessor(**C.audio)

    # load speakers
    if args.speakers_json != '':
        speakers = json.load(open(args.speakers_json, 'r'))
        num_speakers = len(speakers)
    else:
        num_speakers = 0

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    model = setup_model(num_chars, num_speakers, C)
예제 #27
0
파일: train.py 프로젝트: Vproject/TTS
def main(args):

    # setup output paths and read configs
    c = load_config(args.config_path)
    _ = os.path.dirname(os.path.realpath(__file__))
    OUT_PATH = os.path.join(_, c.output_path)
    OUT_PATH = create_experiment_folder(OUT_PATH)
    CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints')
    shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json'))

    # save config to tmp place to be loaded by subsequent modules.
    file_name = str(os.getpid())
    tmp_path = os.path.join("/tmp/", file_name+'_tts')
    pickle.dump(c, open(tmp_path, "wb"))

    # setup tensorboard
    LOG_DIR = OUT_PATH
    tb = SummaryWriter(LOG_DIR)

    # Ctrl+C handler to remove empty experiment folder
    def signal_handler(signal, frame):
        print(" !! Pressed Ctrl+C !!")
        remove_experiment_folder(OUT_PATH)
        sys.exit(1)
    signal.signal(signal.SIGINT, signal_handler)

    # Setup the dataset
    dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata.csv'),
                              os.path.join(c.data_path, 'wavs'),
                              c.r,
                              c.sample_rate,
                              c.text_cleaner,
                              c.num_mels,
                              c.min_level_db,
                              c.frame_shift_ms,
                              c.frame_length_ms,
                              c.preemphasis,
                              c.ref_level_db,
                              c.num_freq,
                              c.power
                             )

    dataloader = DataLoader(dataset, batch_size=c.batch_size,
                            shuffle=True, collate_fn=dataset.collate_fn,
                            drop_last=True, num_workers=c.num_loader_workers)

    # setup the model
    model = Tacotron(c.embedding_size,
                     c.hidden_size,
                     c.num_mels,
                     c.num_freq,
                     c.r)

    # plot model on tensorboard
    dummy_input = dataset.get_dummy_data()

    ## TODO: onnx does not support RNN fully yet
    # model_proto_path = os.path.join(OUT_PATH, "model.proto")
    # onnx.export(model, dummy_input, model_proto_path, verbose=True)
    # tb.add_graph_onnx(model_proto_path)

    if use_cuda:
        model = nn.DataParallel(model.cuda())

    optimizer = optim.Adam(model.parameters(), lr=c.lr)

    if args.restore_step:
        checkpoint = torch.load(os.path.join(
            args.restore_path, 'checkpoint_%d.pth.tar' % args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n > Model restored from step %d\n" % args.restore_step)
        start_epoch = checkpoint['step'] // len(dataloader)
        best_loss = checkpoint['linear_loss']
    else:
        start_epoch = 0
        print("\n > Starting a new training")

    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params))

    model = model.train()

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if use_cuda:
        criterion = nn.L1Loss().cuda()
    else:
        criterion = nn.L1Loss()

    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)

    #lr_scheduler = ReduceLROnPlateau(optimizer, factor=c.lr_decay,
    #                               patience=c.lr_patience, verbose=True)
    epoch_time = 0
    best_loss = float('inf')
    for epoch in range(0, c.epochs):

        print("\n | > Epoch {}/{}".format(epoch, c.epochs))
        progbar = Progbar(len(dataset) / c.batch_size)

        for num_iter, data in enumerate(dataloader):
            start_time = time.time()

            text_input = data[0]
            text_lengths = data[1]
            linear_input = data[2]
            mel_input = data[3]

            current_step = num_iter + args.restore_step + epoch * len(dataloader) + 1

            # setup lr
            current_lr = lr_decay(c.lr, current_step)
            for params_group in optimizer.param_groups:
                params_group['lr'] = current_lr

            optimizer.zero_grad()

            # Add a single frame of zeros to Mel Specs for better end detection
            #try:
            #    mel_input = np.concatenate((np.zeros(
            #        [c.batch_size, 1, c.num_mels], dtype=np.float32),
            #        mel_input[:, 1:, :]), axis=1)
            #except:
            #    raise TypeError("not same dimension")

            # convert inputs to variables
            text_input_var = Variable(text_input)
            mel_spec_var = Variable(mel_input)
            linear_spec_var = Variable(linear_input, volatile=True)

            # sort sequence by length.
            # TODO: might be unnecessary
            sorted_lengths, indices = torch.sort(
                     text_lengths.view(-1), dim=0, descending=True)
            sorted_lengths = sorted_lengths.long().numpy()

            text_input_var = text_input_var[indices]
            mel_spec_var = mel_spec_var[indices]
            linear_spec_var = linear_spec_var[indices]

            if use_cuda:
                text_input_var = text_input_var.cuda()
                mel_spec_var = mel_spec_var.cuda()
                linear_spec_var = linear_spec_var.cuda()

            mel_output, linear_output, alignments =\
                model.forward(text_input_var, mel_spec_var,
                              input_lengths= torch.autograd.Variable(torch.cuda.LongTensor(sorted_lengths)))

            mel_loss = criterion(mel_output, mel_spec_var)
            #linear_loss = torch.abs(linear_output - linear_spec_var)
            #linear_loss = 0.5 * \
                #torch.mean(linear_loss) + 0.5 * \
                #torch.mean(linear_loss[:, :n_priority_freq, :])
            linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \
                    + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                                      linear_spec_var[: ,: ,:n_priority_freq])
            loss = mel_loss + linear_loss
            # loss = loss.cuda()

            loss.backward()
            grad_norm = nn.utils.clip_grad_norm(model.parameters(), 1.)  ## TODO: maybe no need
            optimizer.step()

            step_time = time.time() - start_time
            epoch_time += step_time

            progbar.update(num_iter+1, values=[('total_loss', loss.data[0]),
                                       ('linear_loss', linear_loss.data[0]),
                                       ('mel_loss', mel_loss.data[0]),
                                       ('grad_norm', grad_norm)])

            # Plot Learning Stats
            tb.add_scalar('Loss/TotalLoss', loss.data[0], current_step)
            tb.add_scalar('Loss/LinearLoss', linear_loss.data[0],
                          current_step)
            tb.add_scalar('Loss/MelLoss', mel_loss.data[0], current_step)
            tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'],
                          current_step)
            tb.add_scalar('Params/GradNorm', grad_norm, current_step)
            tb.add_scalar('Time/StepTime', step_time, current_step)

            align_img = alignments[0].data.cpu().numpy()
            align_img = plot_alignment(align_img)
            tb.add_image('Attn/Alignment', align_img, current_step)

            if current_step % c.save_step == 0:

                if c.checkpoint:
                    # save model
                    save_checkpoint(model, optimizer, linear_loss.data[0],
                                    OUT_PATH, current_step, epoch)

                # Diagnostic visualizations
                const_spec = linear_output[0].data.cpu().numpy()
                gt_spec = linear_spec_var[0].data.cpu().numpy()

                const_spec = plot_spectrogram(const_spec, dataset.ap)
                gt_spec = plot_spectrogram(gt_spec, dataset.ap)
                tb.add_image('Spec/Reconstruction', const_spec, current_step)
                tb.add_image('Spec/GroundTruth', gt_spec, current_step)

                align_img = alignments[0].data.cpu().numpy()
                align_img = plot_alignment(align_img)
                tb.add_image('Attn/Alignment', align_img, current_step)

                # Sample audio
                audio_signal = linear_output[0].data.cpu().numpy()
                dataset.ap.griffin_lim_iters = 60
                audio_signal = dataset.ap.inv_spectrogram(audio_signal.T)
                try:
                    tb.add_audio('SampleAudio', audio_signal, current_step,
                                 sample_rate=c.sample_rate)
                except:
                    print("\n > Error at audio signal on TB!!")
                    print(audio_signal.max())
                    print(audio_signal.min())


        # average loss after the epoch
        avg_epoch_loss = np.mean(
            progbar.sum_values['linear_loss'][0] / max(1, progbar.sum_values['linear_loss'][1]))
        best_loss = save_best_model(model, optimizer, avg_epoch_loss,
                                    best_loss, OUT_PATH,
                                    current_step, epoch)

        #lr_scheduler.step(loss.data[0])
        tb.add_scalar('Time/EpochTime', epoch_time, epoch)
        epoch_time = 0
예제 #28
0
def main(args):

    # setup output paths and read configs
    c = load_config(args.config_path)
    _ = os.path.dirname(os.path.realpath(__file__))
    OUT_PATH = os.path.join(_, c.output_path)
    OUT_PATH = create_experiment_folder(OUT_PATH)
    CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints')
    shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json'))

    # Ctrl+C handler to remove empty experiment folder
    def signal_handler(signal, frame):
        print(" !! Pressed Ctrl+C !!")
        remove_experiment_folder(OUT_PATH)
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)

    dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata.csv'),
                              os.path.join(c.data_path, 'wavs'), c.r,
                              c.sample_rate, c.text_cleaner)

    model = Tacotron(c.embedding_size, c.hidden_size, c.num_mels, c.num_freq,
                     c.r)
    if use_cuda:
        model = nn.DataParallel(model.cuda())

    optimizer = optim.Adam(model.parameters(), lr=c.lr)

    try:
        checkpoint = torch.load(
            os.path.join(CHECKPOINT_PATH,
                         'checkpoint_%d.pth.tar' % args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n > Model restored from step %d\n" % args.restore_step)

    except:
        print("\n > Starting a new training\n")

    model = model.train()

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if use_cuda:
        criterion = nn.L1Loss().cuda()
    else:
        criterion = nn.L1Loss()

    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)

    for epoch in range(c.epochs):

        dataloader = DataLoader(dataset,
                                batch_size=c.batch_size,
                                shuffle=True,
                                collate_fn=dataset.collate_fn,
                                drop_last=True,
                                num_workers=32)
        progbar = Progbar(len(dataset) / c.batch_size)

        for i, data in enumerate(dataloader):
            text_input = data[0]
            magnitude_input = data[1]
            mel_input = data[2]

            current_step = i + args.restore_step + epoch * len(dataloader) + 1

            optimizer.zero_grad()

            try:
                mel_input = np.concatenate(
                    (np.zeros([c.batch_size, 1, c.num_mels],
                              dtype=np.float32), mel_input[:, 1:, :]),
                    axis=1)
            except:
                raise TypeError("not same dimension")

            if use_cuda:
                text_input_var = Variable(torch.from_numpy(text_input).type(
                    torch.cuda.LongTensor),
                                          requires_grad=False).cuda()
                mel_input_var = Variable(torch.from_numpy(mel_input).type(
                    torch.cuda.FloatTensor),
                                         requires_grad=False).cuda()
                mel_spec_var = Variable(torch.from_numpy(mel_input).type(
                    torch.cuda.FloatTensor),
                                        requires_grad=False).cuda()
                linear_spec_var = Variable(
                    torch.from_numpy(magnitude_input).type(
                        torch.cuda.FloatTensor),
                    requires_grad=False).cuda()

            else:
                text_input_var = Variable(torch.from_numpy(text_input).type(
                    torch.LongTensor),
                                          requires_grad=False)
                mel_input_var = Variable(torch.from_numpy(mel_input).type(
                    torch.FloatTensor),
                                         requires_grad=False)
                mel_spec_var = Variable(torch.from_numpy(mel_input).type(
                    torch.FloatTensor),
                                        requires_grad=False)
                linear_spec_var = Variable(
                    torch.from_numpy(magnitude_input).type(torch.FloatTensor),
                    requires_grad=False)

            mel_output, linear_output, alignments =\
                model.forward(text_input_var, mel_input_var)

            mel_loss = criterion(mel_output, mel_spec_var)
            linear_loss = torch.abs(linear_output - linear_spec_var)
            linear_loss = 0.5 * \
                torch.mean(linear_loss) + 0.5 * \
                torch.mean(linear_loss[:, :n_priority_freq, :])
            loss = mel_loss + linear_loss
            loss = loss.cuda()

            start_time = time.time()

            loss.backward()

            nn.utils.clip_grad_norm(model.parameters(), 1.)

            optimizer.step()

            time_per_step = time.time() - start_time
            progbar.update(i,
                           values=[('total_loss', loss.data[0]),
                                   ('linear_loss', linear_loss.data[0]),
                                   ('mel_loss', mel_loss.data[0])])

            if current_step % c.save_step == 0:
                checkpoint_path = 'checkpoint_{}.pth.tar'.format(current_step)
                checkpoint_path = os.path.join(OUT_PATH, checkpoint_path)
                save_checkpoint(
                    {
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'step': current_step,
                        'total_loss': loss.data[0],
                        'linear_loss': linear_loss.data[0],
                        'mel_loss': mel_loss.data[0],
                        'date': datetime.date.today().strftime("%B %d, %Y")
                    }, checkpoint_path)
                print(" > Checkpoint is saved : {}".format(checkpoint_path))

            if current_step in c.decay_step:
                optimizer = adjust_learning_rate(optimizer, current_step)