Example #1
def phoneme_to_sequence(text,
                        cleaner_names,
                        language,
                        enable_eos_bos=False,
                        tp=None):
    # pylint: disable=global-statement
    global _phonemes_to_id
    if tp:
        _, _phonemes = make_symbols(**tp)
        _phonemes_to_id = {s: i for i, s in enumerate(_phonemes)}

    sequence = []
    text = text.replace(":", "")
    clean_text = _clean_text(text, cleaner_names)
    to_phonemes = text2phone(clean_text, language)
    if to_phonemes is None:
        print("!! After phoneme conversion the result is None. -- {} ".format(
            clean_text))
        return sequence  # bail out early; calling .split() on None below would raise
    # iterate by skipping empty strings - NOTE: might be useful to keep them for better intonation.
    for phoneme in filter(None, to_phonemes.split('|')):
        sequence += _phoneme_to_sequence(phoneme)
    # Append EOS char
    if enable_eos_bos:
        sequence = pad_with_eos_bos(sequence, tp=tp)
    return sequence
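
For context, a call might look like the following sketch; the cleaner name and language code are illustrative assumptions, and text2phone requires a phonemizer backend (e.g. espeak) to be installed:

# hedged usage sketch -- cleaner/language names are assumptions
seq = phoneme_to_sequence("Hello world.",
                          cleaner_names=["phoneme_cleaners"],
                          language="en-us",
                          enable_eos_bos=True)
print(seq)  # list of integer phoneme IDs, wrapped with BOS/EOS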
Example #2
def text_to_sequence(text, cleaner_names, tp=None):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

      The text can optionally have ARPAbet sequences enclosed in curly braces embedded
      in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."

      Args:
        text: string to convert to a sequence
        cleaner_names: names of the cleaner functions to run the text through

      Returns:
        List of integers corresponding to the symbols in the text
    '''
    # pylint: disable=global-statement
    global _symbol_to_id
    if tp:
        _symbols, _ = make_symbols(**tp)
        _symbol_to_id = {s: i for i, s in enumerate(_symbols)}

    sequence = []
    # Check for curly braces and treat their contents as ARPAbet:
    while text:
        m = _CURLY_RE.match(text)
        if not m:
            sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
            break
        sequence += _symbols_to_sequence(_clean_text(m.group(1),
                                                     cleaner_names))
        sequence += _arpabet_to_sequence(m.group(2))
        text = m.group(3)
    return sequence
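
The docstring's ARPAbet example can be exercised directly; a minimal sketch (the cleaner name is an assumption):

# hedged sketch: braces route their contents through _arpabet_to_sequence,
# the surrounding text goes through _symbols_to_sequence
seq = text_to_sequence("Turn left on {HH AW1 S S T AH0 N} Street.",
                       ["english_cleaners"])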
Example #3
File: __init__.py Project: rvirgilli/TTS
def pad_with_eos_bos(phoneme_sequence, tp=None):
    global _PHONEMES_TO_ID, _bos, _eos
    if tp:
        _bos = tp['bos']
        _eos = tp['eos']
        _, phonemes = make_symbols(**tp)
        _PHONEMES_TO_ID = {s: i for i, s in enumerate(phonemes)}
        
    return [_PHONEMES_TO_ID[_bos]] + list(phoneme_sequence) + [_PHONEMES_TO_ID[_eos]]
Example #4
    def _create_random_model(self):
        # without this, 'symbols'/'phonemes' would be locals and raise
        # UnboundLocalError whenever the 'text' key is absent
        global symbols, phonemes  # pylint: disable=global-statement
        config = load_config(
            os.path.join(get_tests_output_path(), 'dummy_model_config.json'))
        if 'text' in config.keys():
            symbols, phonemes = make_symbols(**config.text)

        num_chars = len(phonemes) if config.use_phonemes else len(symbols)
        model = setup_model(num_chars, 0, config)
        output_path = os.path.join(get_tests_output_path())
        save_checkpoint(model, None, None, None, output_path, 10, 10)
Example #5
def pad_with_eos_bos(phoneme_sequence, tp=None):
    # pylint: disable=global-statement
    global _phonemes_to_id, _bos, _eos
    if tp:
        _bos = tp['bos']
        _eos = tp['eos']
        _, _phonemes = make_symbols(**tp)
        _phonemes_to_id = {s: i for i, s in enumerate(_phonemes)}

    return [_phonemes_to_id[_bos]
            ] + list(phoneme_sequence) + [_phonemes_to_id[_eos]]
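
Across these helpers, tp mirrors the 'characters' block of the model config and is splatted into make_symbols. A hedged sketch of its expected shape (all field values illustrative):

tp = {
    "pad": "_",
    "eos": "~",
    "bos": "^",
    "characters": "abcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
    "punctuations": "!'(),-.:;? ",
    "phonemes": "iyuoaeprst",  # illustrative subset
}
padded = pad_with_eos_bos(sequence, tp=tp)  # [bos_id, *sequence, eos_id]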
Example #6
    def _create_random_model(self):
        # pylint: disable=global-statement
        global symbols, phonemes
        config = load_config(
            os.path.join(get_tests_output_path(), 'dummy_model_config.json'))
        if 'characters' in config.keys():
            symbols, phonemes = make_symbols(**config.characters)

        num_chars = len(phonemes) if config.use_phonemes else len(symbols)
        model = setup_model(num_chars, 0, config)
        output_path = os.path.join(get_tests_output_path())
        save_checkpoint(model, None, None, None, output_path, 10, 10)
Example #7
File: __init__.py Project: rvirgilli/TTS
def sequence_to_phoneme(sequence, tp=None):
    '''Converts a sequence of IDs back to a string'''
    global _ID_TO_PHONEMES
    if tp:
        _, phonemes = make_symbols(**tp)
        _ID_TO_PHONEMES = {i: s for i, s in enumerate(phonemes)}

    result = ''
    for symbol_id in sequence:
        if symbol_id in _ID_TO_PHONEMES:
            s = _ID_TO_PHONEMES[symbol_id]
            result += s
    return result.replace('}{', ' ')
Example #8
def sequence_to_phoneme(sequence, tp=None):
    # pylint: disable=global-statement
    '''Converts a sequence of IDs back to a string'''
    global _id_to_phonemes
    result = ''
    if tp:
        _, _phonemes = make_symbols(**tp)
        _id_to_phonemes = {i: s for i, s in enumerate(_phonemes)}

    for symbol_id in sequence:
        if symbol_id in _id_to_phonemes:
            s = _id_to_phonemes[symbol_id]
            result += s
    return result.replace('}{', ' ')
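
Put together with phoneme_to_sequence from Example #1, a round trip looks roughly like this sketch (tp as sketched above; cleaner/language names are assumptions):

ids = phoneme_to_sequence("hello", ["phoneme_cleaners"], "en-us", tp=tp)
print(sequence_to_phoneme(ids, tp=tp))  # phoneme string rebuilt from the IDs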
Example #9
File: __init__.py Project: rvirgilli/TTS
def sequence_to_text(sequence, tp=None):
    '''Converts a sequence of IDs back to a string'''
    global _ID_TO_SYMBOL
    if tp:
        symbols, _ = make_symbols(**tp)
        _ID_TO_SYMBOL = {i: s for i, s in enumerate(symbols)}

    result = ''
    for symbol_id in sequence:
        if symbol_id in _ID_TO_SYMBOL:
            s = _ID_TO_SYMBOL[symbol_id]
            # Enclose ARPAbet back in curly braces:
            if len(s) > 1 and s[0] == '@':
                s = '{%s}' % s[1:]
            result += s
    return result.replace('}{', ' ')
Example #10
def sequence_to_text(sequence, tp=None):
    '''Converts a sequence of IDs back to a string'''
    # pylint: disable=global-statement
    global _id_to_symbol
    if tp:
        _symbols, _ = make_symbols(**tp)
        _id_to_symbol = {i: s for i, s in enumerate(_symbols)}

    result = ''
    for symbol_id in sequence:
        if symbol_id in _id_to_symbol:
            s = _id_to_symbol[symbol_id]
            # Enclose ARPAbet back in curly braces:
            if len(s) > 1 and s[0] == '@':
                s = '{%s}' % s[1:]
            result += s
    return result.replace('}{', ' ')
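
A matching round trip for the character-level pair, as a sketch (cleaner name again assumed):

ids = text_to_sequence("Turn left on {HH AW1 S} Street.", ["english_cleaners"])
print(sequence_to_text(ids))  # ARPAbet runs come back wrapped in '{...}'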
Example #11
    args = parser.parse_args()

    if args.vocoder_path != "":
        assert args.use_cuda, " [!] Enable cuda for vocoder."
        from WaveRNN.models.wavernn import Model as VocoderModel

    # load the config
    C = load_config(args.config_path)
    C.forward_attn_mask = True

    # load the audio processor
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if 'characters' in C.keys():
        symbols, phonemes = make_symbols(**C.characters)

    # load speakers
    if args.speakers_json != '':
        speakers = json.load(open(args.speakers_json, 'r'))
        num_speakers = len(speakers)
    else:
        num_speakers = 0

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    model = setup_model(num_chars, num_speakers, C)
    cp = torch.load(args.model_path)
    model.load_state_dict(cp['model'])
    model.eval()
    if args.use_cuda:
Example #12
def main(args):  # pylint: disable=redefined-outer-name
    global meta_data_train, meta_data_eval, symbols, phonemes
    # Audio processor
    ap = AudioProcessor(**c.audio)
    print(" > TTS symbols {}".format(len(symbols)))
    print(symbols)
    print(" > TTS phonemes {}".format(len(phonemes)))
    print(phonemes)
    print('-' * 50)
    # if the vocabulary was passed, replace the default
    if 'text' in c.keys():
        symbols, phonemes = make_symbols(**c.text)

    print(" > TTS symbols {}".format(len(symbols)))
    print(symbols)
    print(" > TTS phonemes {}".format(len(phonemes)))
    print(phonemes)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])

    num_chars = len(phonemes) if c.use_phonemes else len(symbols)

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # parse speakers
    if c.use_speaker_embedding:
        speakers = get_speakers(meta_data_train)
        if args.restore_path:
            prev_out_path = os.path.dirname(args.restore_path)
            speaker_mapping = load_speaker_mapping(prev_out_path)
            assert all([speaker in speaker_mapping
                        for speaker in speakers]), "As of now, you cannot " \
                                                   "introduce new speakers to " \
                                                   "a previously trained model."
        else:
            speaker_mapping = {name: i for i, name in enumerate(speakers)}
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print("Training with {} speakers: {}".format(num_speakers,
                                                     ", ".join(speakers)))
    else:
        num_speakers = 0
    print(" | > Num chars : {}".format(num_chars))
    model = setup_model(num_chars, num_speakers, c)

    print(" | > Num output units : {}".format(ap.num_freq), flush=True)

    params = set_weight_decay(model, c.wd)
    optimizer = RAdam(params, lr=c.lr, weight_decay=0)
    if c.stopnet and c.separate_stopnet:
        optimizer_st = RAdam(model.decoder.stopnet.parameters(),
                             lr=c.lr,
                             weight_decay=0)
    else:
        optimizer_st = None

    if c.loss_masking:
        criterion = L1LossMasked() if c.model in ["Tacotron", "TacotronGST"
                                                  ] else MSELossMasked()
    else:
        criterion = nn.L1Loss() if c.model in ["Tacotron", "TacotronGST"
                                               ] else nn.MSELoss()
    criterion_st = nn.BCEWithLogitsLoss(
        pos_weight=torch.tensor(10)) if c.stopnet else None

    if args.restore_path:
        checkpoint = torch.load(args.restore_path, map_location='cpu')
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            # optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except Exception:  # fall back to partial init on any restore failure
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint, c)
            model.load_state_dict(model_dict)
            del model_dict
        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model.cuda()
        criterion.cuda()
        if criterion_st:
            criterion_st.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.noam_schedule:
        scheduler = NoamLR(optimizer,
                           warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        # set gradual training
        if c.gradual_training is not None:
            r, c.batch_size = gradual_training_scheduler(global_step, c)
            c.r = r
            model.decoder.set_r(r)
            if c.bidirectional_decoder:
                model.decoder_backward.set_r(r)
        print(" > Number of outputs per iteration:", model.decoder.r)

        train_loss, global_step = train(model, criterion, criterion_st,
                                        optimizer, optimizer_st, scheduler, ap,
                                        global_step, epoch)
        val_loss = evaluate(model, criterion, criterion_st, ap, global_step,
                            epoch)
        print(" | > Training Loss: {:.5f}   Validation Loss: {:.5f}".format(
            train_loss, val_loss),
              flush=True)
        target_loss = train_loss
        if c.run_eval:
            target_loss = val_loss
        best_loss = save_best_model(model, optimizer, target_loss, best_loss,
                                    OUT_PATH, global_step, epoch)
Example #13
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, symbols, phonemes
    # Audio processor
    ap = AudioProcessor(**c.audio)
    if 'characters' in c.keys():
        symbols, phonemes = make_symbols(**c.characters)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # parse speakers
    if c.use_speaker_embedding:
        speakers = get_speakers(meta_data_train)
        if args.restore_path:
            prev_out_path = os.path.dirname(args.restore_path)
            speaker_mapping = load_speaker_mapping(prev_out_path)
            assert all([speaker in speaker_mapping
                        for speaker in speakers]), "As of now, you cannot " \
                                                   "introduce new speakers to " \
                                                   "a previously trained model."
        else:
            speaker_mapping = {name: i for i, name in enumerate(speakers)}
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print("Training with {} speakers: {}".format(num_speakers,
                                                     ", ".join(speakers)))
    else:
        num_speakers = 0

    model = setup_model(num_chars, num_speakers, c)

    print(" | > Num output units : {}".format(ap.num_freq), flush=True)

    params = set_weight_decay(model, c.wd)
    optimizer = RAdam(params, lr=c.lr, weight_decay=0)
    if c.stopnet and c.separate_stopnet:
        optimizer_st = RAdam(model.decoder.stopnet.parameters(),
                             lr=c.lr,
                             weight_decay=0)
    else:
        optimizer_st = None

    # setup criterion
    criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path, map_location='cpu')
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            # optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except Exception:  # fall back to partial init on any restore failure
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint, c)
            model.load_state_dict(model_dict)
            del model_dict
        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model.cuda()
        criterion.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.noam_schedule:
        scheduler = NoamLR(optimizer,
                           warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        c_logger.print_epoch_start(epoch, c.epochs)
        # set gradual training
        if c.gradual_training is not None:
            r, c.batch_size = gradual_training_scheduler(global_step, c)
            c.r = r
            model.decoder.set_r(r)
            if c.bidirectional_decoder:
                model.decoder_backward.set_r(r)
            print("\n > Number of output frames:", model.decoder.r)

        train_avg_loss_dict, global_step = train(model, criterion, optimizer,
                                                 optimizer_st, scheduler, ap,
                                                 global_step, epoch)
        eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        target_loss = train_avg_loss_dict['avg_postnet_loss']
        if c.run_eval:
            target_loss = eval_avg_loss_dict['avg_postnet_loss']
        best_loss = save_best_model(target_loss, best_loss, model, optimizer,
                                    global_step, epoch, c.r, OUT_PATH)
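
The gradual-training branch above reads a schedule from the config; judging from this loop, each entry is consumed by gradual_training_scheduler as (start_step, r, batch_size). A hedged sketch of such a config value (the layout is an assumption and the numbers are illustrative):

# assumed schedule layout: [start_step, r, batch_size] per entry
c.gradual_training = [
    [0, 7, 64],
    [10000, 5, 64],
    [50000, 3, 32],
]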
Example #14
def main(**kwargs):
    global symbols, phonemes # pylint: disable=global-statement
    current_date = date.today()
    current_date = current_date.strftime("%B %d %Y")
    start_time = time.time()

    # read passed variables from gui
    text = kwargs['text']                           # text to generate speech from
    use_cuda = kwargs['use_cuda']                   # if gpu exists default is true
    project = kwargs['project']                     # path to project folder
    vocoder_type = kwargs['vocoder']                # vocoder type, default is GL
    use_gst = kwargs['use_gst']                     # use style_wave for prosody
    style_dict = kwargs['style_input']              # use style_wave for prosody
    speaker_id = kwargs['speaker_id']               # name of the selected speaker
    sentence_file = kwargs['sentence_file']         # path to file if generate from file
    out_path = kwargs['out_path']                   # path to save the output wav

    batched_vocoder = True

    # load speakers
    speakers_file_path = Path(project, "speakers.json")
    if speakers_file_path.is_file():
        speaker_data = json.load(open(speakers_file_path, 'r'))
        num_speakers = len(speaker_data)
        #get the speaker id for selected speaker
        if speaker_id >= num_speakers:
            print('Speaker ID outside of number of speakers range. Using default 0.')
            speaker_id = 0
        # both branches performed the same lookup, so do it once
        speaker_name = [speaker for speaker, id in speaker_data.items() if speaker_id == id][0]
    else:
        speaker_name = 'Default'
        num_speakers = 0
        speaker_id = None

    # load the config
    config_path = Path(project, "config.json")
    C = load_config(config_path)

    # default to None so style_input is always defined, even when
    # use_gst is set but no style_dict was passed
    style_input = None
    if use_gst and style_dict is not None:
        style_input = style_dict

    # load the audio processor
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if 'characters' in C.keys():
        symbols, phonemes = make_symbols(**C.characters)
        

    # find the tts model file in project folder
    try:
        tts_model_file = glob(str(Path(project, '*.pth.tar')))
        if not tts_model_file:
            raise FileNotFoundError
        model_path = tts_model_file[0]
    except FileNotFoundError:
        print('[!] TTS Model not found in path: "{}"'.format(project))
        sys.exit(1)  # model_path would be undefined past this point

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    model = setup_model(num_chars, num_speakers, C)

    # if gpu is not available use cpu
    model, state = load_checkpoint(model, model_path, use_cuda=use_cuda)

    model.decoder.max_decoder_steps = 2000

    model.eval()
    print(' > Model step:', state['step'])
    print(' > Model r: ', state['r'])

    # load vocoder
    if vocoder_type == 'MelGAN':  # 'is' compared identity, not string equality
        try:
            model_file = glob(str(Path(project, 'vocoder/*.pth.tar')))
            vocoder, ap_vocoder = load_vocoder(str(Path('TTS')),
                                               str(model_file[0]),
                                               str(Path(project, 'vocoder/config.json')),
                                               use_cuda)
        except Exception:
            print('[!] Error loading vocoder: "{}"'.format(project))
            sys.exit(0)

    elif vocoder_type == 'WaveRNN':
        try:
            model_file = glob(str(Path(project, 'vocoder/*.pkl')))
            vocoder, ap_vocoder = load_vocoder(str(Path('TTS')), str(model_file[0]), str(Path(project, 'config.yml')), use_cuda)
        except Exception:
            print('[!] Error loading vocoder: "{}"'.format(project))
            sys.exit(0)
    else:
        vocoder, ap_vocoder = None, None

    print(" > Vocoder: {}".format(vocoder_type))
    print(' > Using style input: {}\n'.format(style_input))

    if sentence_file != '':
        with open(sentence_file, "r", encoding='utf8') as f:
            list_of_sentences = [s.strip() for s in f.readlines()]
    else:
        list_of_sentences = [text.strip()]

    # iterate over every passed sentence and synthesize
    for _, tts_sentence in enumerate(list_of_sentences):
        wav_list = []
        # remove characters which are not alphanumeric or in ',. '
        tts_sentence = clean_sentence(tts_sentence)
        print(" > Text: {}".format(tts_sentence))
        # build filename
        current_time = datetime.now().strftime("%H%M%S")
        file_name = ' '.join(tts_sentence.split(" ")[:10])
        # if multiple sentences in one line -> split them
        tts_sentence = split_into_sentences(tts_sentence)
        
        # if sentence was split in sub-sentences -> iterate over them
        for sentence in tts_sentence:
            # synthesize voice
            _, _, _, wav = tts(model,
                               vocoder,
                               C,
                               None,
                               sentence,
                               ap,
                               ap_vocoder,
                               use_cuda,
                               batched_vocoder,
                               speaker_id=speaker_id,
                               style_input=style_input,
                               figures=False)

            # join sub-sentences back together and add a filler between them
            wav_list += list(wav)
            wav_list += [0] * 10000

        wav = np.array(wav_list)

        # finalize filename
        file_name = "_".join([str(current_time), file_name])
        file_name = file_name.translate(
            str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'

        if out_path == "":
            out_dir = str(Path(project, 'output', current_date, speaker_name))
            out_path = os.path.join(out_dir, file_name)
        else:
            out_dir = os.path.dirname(out_path)

        # create output directory if it doesn't exist
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir, exist_ok=True)

        # save generated wav to disk
        ap.save_wav(wav, out_path)
        end_time = time.time()
        print(" > Run-time: {}".format(end_time - start_time))
        print(" > Saving output to {}\n".format(out_path))
Example #15
    args = parser.parse_args()

    if args.vocoder_path != "":
        assert args.use_cuda, " [!] Enable cuda for vocoder."
        from WaveRNN.models.wavernn import Model as VocoderModel

    # load the config
    C = load_config(args.config_path)
    C.forward_attn_mask = True

    # load the audio processor
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if 'text' in C.keys():
        symbols, phonemes = make_symbols(**C.text)

    # load speakers
    if args.speakers_json != '':
        speakers = json.load(open(args.speakers_json, 'r'))
        num_speakers = len(speakers)
    else:
        num_speakers = 0

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    model = setup_model(num_chars, num_speakers, C)
    cp = torch.load(args.model_path)
    model.load_state_dict(cp['model'])
    model.eval()
    if args.use_cuda: