def text_to_sequence(text, cleaner_names, tp=None):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
    in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."

    Args:
        text: string to convert to a sequence
        cleaner_names: names of the cleaner functions to run the text through
        tp: optional characters config dict; if given, the symbol table is
            rebuilt from it via make_symbols

    Returns:
        List of integers corresponding to the symbols in the text
    '''
    # pylint: disable=global-statement
    global _symbol_to_id
    if tp:
        _symbols, _ = make_symbols(**tp)
        _symbol_to_id = {s: i for i, s in enumerate(_symbols)}

    sequence = []
    # Check for curly braces and treat their contents as ARPAbet:
    while text:
        m = _CURLY_RE.match(text)
        if not m:
            sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
            break
        sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
        sequence += _arpabet_to_sequence(m.group(2))
        text = m.group(3)
    return sequence
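# Usage sketch (illustrative, not from the source). Assumes 'english_cleaners'
# is among the registered cleaner functions and the default symbol table is
# already loaded.
example_ids = text_to_sequence("Turn left on {HH AW1 S S T AH0 N} Street.",
                               ['english_cleaners'])
# Plain text is cleaned and mapped per character; the {...} span is mapped
# through the '@'-prefixed ARPAbet symbol IDs instead.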
def pad_with_eos_bos(phoneme_sequence, tp=None):
    # pylint: disable=global-statement
    global _phonemes_to_id, _bos, _eos
    if tp:
        _bos = tp['bos']
        _eos = tp['eos']
        _, _phonemes = make_symbols(**tp)
        _phonemes_to_id = {s: i for i, s in enumerate(_phonemes)}

    return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]]
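# Hedged example of the `tp` dict these helpers accept. The code above only
# shows that it feeds make_symbols(**tp) and carries 'bos'/'eos' keys, so the
# full key set below is an assumption modeled on a typical characters config.
EXAMPLE_TP = {
    'pad': '_',
    'eos': '~',
    'bos': '^',
    'characters': 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ',
    'punctuations': '!\'(),-.:;? ',
    'phonemes': 'iyuoeaptkbdgfsvzhmnlrw',  # placeholder inventory, not a real IPA set
}
# e.g. padded = pad_with_eos_bos(phoneme_ids, tp=EXAMPLE_TP)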
def _create_random_model(self):
    # pylint: disable=global-statement
    global symbols, phonemes
    config = load_config(os.path.join(get_tests_output_path(),
                                      'dummy_model_config.json'))
    if 'characters' in config.keys():
        symbols, phonemes = make_symbols(**config.characters)

    num_chars = len(phonemes) if config.use_phonemes else len(symbols)
    model = setup_model(num_chars, 0, config)
    output_path = get_tests_output_path()
    save_checkpoint(model, None, 10, 10, 1, output_path)
def sequence_to_phoneme(sequence, tp=None):
    '''Converts a sequence of IDs back to a string'''
    # pylint: disable=global-statement
    global _id_to_phonemes
    result = ''
    if tp:
        _, _phonemes = make_symbols(**tp)
        _id_to_phonemes = {i: s for i, s in enumerate(_phonemes)}

    for symbol_id in sequence:
        if symbol_id in _id_to_phonemes:
            s = _id_to_phonemes[symbol_id]
            result += s
    return result.replace('}{', ' ')
def sequence_to_text(sequence, tp=None):
    '''Converts a sequence of IDs back to a string'''
    # pylint: disable=global-statement
    global _id_to_symbol
    if tp:
        _symbols, _ = make_symbols(**tp)
        _id_to_symbol = {i: s for i, s in enumerate(_symbols)}

    result = ''
    for symbol_id in sequence:
        if symbol_id in _id_to_symbol:
            s = _id_to_symbol[symbol_id]
            # Enclose ARPAbet back in curly braces:
            if len(s) > 1 and s[0] == '@':
                s = '{%s}' % s[1:]
            result += s
    return result.replace('}{', ' ')
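# Round-trip sketch (illustrative, not from the source; 'basic_cleaners' is an
# assumed cleaner name). ARPAbet IDs decode to '@'-prefixed symbols, each is
# re-wrapped in braces, and the '}{' to ' ' replacement merges adjacent ones
# back into a single span.
roundtrip_ids = text_to_sequence("hello {HH AH0 L OW1} world.", ['basic_cleaners'])
print(sequence_to_text(roundtrip_ids))  # expected: "hello {HH AH0 L OW1} world."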
def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None):
    # pylint: disable=global-statement
    global _phonemes_to_id
    if tp:
        _, _phonemes = make_symbols(**tp)
        _phonemes_to_id = {s: i for i, s in enumerate(_phonemes)}

    sequence = []
    clean_text = _clean_text(text, cleaner_names)
    to_phonemes = text2phone(clean_text, language)
    if to_phonemes is None:
        print("!! After phoneme conversion the result is None. -- {} ".format(clean_text))
        return sequence  # bail out instead of crashing on None.split() below
    # Iterate, skipping empty strings.
    # NOTE: keeping them might be useful for better intonation.
    for phoneme in filter(None, to_phonemes.split('|')):
        sequence += _phoneme_to_sequence(phoneme)
    # Append BOS/EOS chars
    if enable_eos_bos:
        sequence = pad_with_eos_bos(sequence, tp=tp)
    return sequence
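# Usage sketch (illustrative, not from the source; the cleaner name and
# language code are assumptions, and text2phone typically requires a phonemizer
# backend such as espeak to be installed).
phoneme_ids = phoneme_to_sequence("hello world", ['phoneme_cleaners'], 'en-us',
                                  enable_eos_bos=True)
print(sequence_to_phoneme(phoneme_ids))  # phoneme string framed by the BOS/EOS symbols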
    '--gst_style',
    help="Wav file path for GST style reference.",
    default=None)
args = parser.parse_args()

# load the config
C = load_config(args.config_path)
C.forward_attn_mask = True

# load the audio processor
ap = AudioProcessor(**C.audio)

# if the vocabulary was passed, replace the default
if 'characters' in C.keys():
    symbols, phonemes = make_symbols(**C.characters)

speaker_embedding = None
speaker_embedding_dim = None
num_speakers = 0

# load speakers
if args.speakers_json != '':
    speaker_mapping = json.load(open(args.speakers_json, 'r'))
    num_speakers = len(speaker_mapping)
    if C.use_external_speaker_embedding_file:
        if args.speaker_fileid is not None:
            speaker_embedding = speaker_mapping[args.speaker_fileid]['embedding']
        else:
            # if speaker_fileid is not specified, use the first sample in speakers.json
            speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']
        speaker_embedding_dim = len(speaker_embedding)
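# Hedged sketch of the speakers.json structure implied by the lookups above.
# File IDs and vector values are made up; only the fileid -> {'embedding': [...]}
# shape is taken from the code, and real embedding vectors are much longer.
EXAMPLE_SPEAKER_MAPPING = {
    'LJ001-0001.wav': {'embedding': [0.012, -0.031, 0.094]},
    'LJ001-0002.wav': {'embedding': [0.044, 0.007, -0.058]},
}
# speaker_embedding_dim would be the length of any entry's 'embedding' list
# (3 in this toy case).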
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, symbols, phonemes
    # Audio processor
    ap = AudioProcessor(**c.audio)
    if 'characters' in c.keys():
        symbols, phonemes = make_symbols(**c.characters)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # set the portion of the data used for training
    if 'train_portion' in c.keys():
        meta_data_train = meta_data_train[:int(len(meta_data_train) * c.train_portion)]
    if 'eval_portion' in c.keys():
        meta_data_eval = meta_data_eval[:int(len(meta_data_eval) * c.eval_portion)]

    # parse speakers
    if c.use_speaker_embedding:
        speakers = get_speakers(meta_data_train)
        if args.restore_path:
            if c.use_external_speaker_embedding_file:
                # restoring a checkpoint while using an external embedding file
                prev_out_path = os.path.dirname(args.restore_path)
                speaker_mapping = load_speaker_mapping(prev_out_path)
                if not speaker_mapping:
                    print("WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file")
                    speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
                    if not speaker_mapping:
                        raise RuntimeError("You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file")
                speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding'])
            elif not c.use_external_speaker_embedding_file:
                # restoring a checkpoint without an external embedding file
                prev_out_path = os.path.dirname(args.restore_path)
                speaker_mapping = load_speaker_mapping(prev_out_path)
                speaker_embedding_dim = None
            assert all(speaker in speaker_mapping for speaker in speakers), \
                "As of now, you cannot introduce new speakers to a previously trained model."
        elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file:
            # starting a new training run with an external embedding file
            speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding'])
        elif c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file:
            # external embeddings requested but no file was given
            raise RuntimeError(
                "use_external_speaker_embedding_file is True, so you need to pass an "
                "external speaker embedding file. Run the "
                "GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or "
                "AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb "
                "notebook in the notebooks/ folder.")
        else:
            # starting a new training run without an external embedding file
            speaker_mapping = {name: i for i, name in enumerate(speakers)}
            speaker_embedding_dim = None
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print("Training with {} speakers: {}".format(num_speakers, ", ".join(speakers)))
    else:
        num_speakers = 0
        speaker_embedding_dim = None
        speaker_mapping = None

    model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim)

    params = set_weight_decay(model, c.wd)
    optimizer = RAdam(params, lr=c.lr, weight_decay=0)
    if c.stopnet and c.separate_stopnet:
        optimizer_st = RAdam(model.decoder.stopnet.parameters(),
                             lr=c.lr,
                             weight_decay=0)
    else:
        optimizer_st = None

    if c.apex_amp_level == "O1":
        # pylint: disable=import-outside-toplevel
        from apex import amp
        model.cuda()
        model, optimizer = amp.initialize(model, optimizer, opt_level=c.apex_amp_level)
    else:
        amp = None

    # setup criterion
    criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path, map_location='cpu')
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            # optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except (KeyError, RuntimeError):
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint['model'], c)
            # torch.save(model_dict, os.path.join(OUT_PATH, 'state_dict.pt'))
            # print("State Dict saved for debug in: ", os.path.join(OUT_PATH, 'state_dict.pt'))
            model.load_state_dict(model_dict)
            del model_dict

        if amp and 'amp' in checkpoint:
            amp.load_state_dict(checkpoint['amp'])

        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'], flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model.cuda()
        criterion.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.noam_schedule:
        scheduler = NoamLR(optimizer,
                           warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        c_logger.print_epoch_start(epoch, c.epochs)
        # set gradual training
        if c.gradual_training is not None:
            r, c.batch_size = gradual_training_scheduler(global_step, c)
            c.r = r
            model.decoder.set_r(r)
            if c.bidirectional_decoder:
                model.decoder_backward.set_r(r)
            print("\n > Number of output frames:", model.decoder.r)
        train_avg_loss_dict, global_step = train(model, criterion, optimizer,
                                                 optimizer_st,
                                                 scheduler, ap, global_step,
                                                 epoch, amp, speaker_mapping)
        eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch, speaker_mapping)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        target_loss = train_avg_loss_dict['avg_postnet_loss']
        if c.run_eval:
            target_loss = eval_avg_loss_dict['avg_postnet_loss']
        best_loss = save_best_model(target_loss,
                                    best_loss,
                                    model,
                                    optimizer,
                                    global_step,
                                    epoch,
                                    c.r,
                                    OUT_PATH,
                                    amp_state_dict=amp.state_dict() if amp else None)
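# Hedged example of a gradual-training schedule as consumed by
# gradual_training_scheduler above. The row format [first_step, r, batch_size]
# is an assumption: the scheduler is expected to return the (r, batch_size)
# pair of the last row whose step threshold global_step has passed.
EXAMPLE_GRADUAL_TRAINING = [
    [0, 7, 64],       # start coarse: 7 decoder frames per step, batch size 64
    [10000, 5, 64],
    [50000, 3, 32],
    [130000, 2, 32],  # finish fine-grained: fewer frames per step, smaller batch
]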