def validate_dataset(filepaths_and_text, dataset_directory, symbols):
    """
    Validates dataset has required files and a valid character set

    Parameters
    ----------
    filepaths_and_text : list
        List of samples, each unpacked as a (filename, text) pair
    dataset_directory : str
        Path to dataset audio directory
    symbols : list
        List of supported symbols

    Raises
    -------
    AssertionError
        If files are missing or invalid characters are found
    """
    # Build the listing once as a set so every membership test is O(1)
    wavs = set(os.listdir(dataset_directory))
    missing_files = set()
    invalid_characters = set()

    for filename, text in filepaths_and_text:
        text = clean_text(text, remove_invalid_characters=False)
        if filename not in wavs:
            missing_files.add(filename)
        invalid_characters_for_row = get_invalid_characters(text, symbols)
        if invalid_characters_for_row:
            invalid_characters.update(invalid_characters_for_row)

    # Raise explicitly (instead of `assert`) so validation survives `python -O`
    if missing_files:
        raise AssertionError(f"Missing files: {','.join(sorted(missing_files))}")

    if invalid_characters:
        # unicodedata.name() raises ValueError for unnamed characters (e.g.
        # control characters), so fall back to the code point instead of
        # masking the validation error.
        described = [
            f"{c} ({unicodedata.name(c, f'U+{ord(c):04X}')})"
            for c in sorted(invalid_characters)
        ]
        raise AssertionError(
            f"Invalid characters in text (for alphabet): {','.join(described)}"
        )
def synthesize(model, text, inflect_engine, graph=None, audio=None, vocoder=None, vocoder_type=None):
    """
    Synthesise text for a given model.
    Produces graph and/or audio file when given.

    Parameters
    ----------
    model : Tacotron2
        Tacotron2 model
    text : str
        Text to synthesize
    inflect_engine : Inflect
        Inflect.engine() object
    graph : str (optional)
        Path to save alignment graph to
    audio : str (optional)
        Path to save audio file to
    vocoder : Object (optional)
        Vocoder model (required if generating audio)
    vocoder_type : str (optional)
        Either "hifigan" or "waveglow" (required if generating audio)

    Raises
    -------
    AssertionError
        If audio is given without a vocoder
    ValueError
        If audio is given with an unsupported vocoder type
    """
    # Validate the vocoder arguments up front so a misconfiguration fails
    # immediately rather than after model inference has already run.
    if audio:
        if not vocoder:
            raise AssertionError("Missing vocoder")
        if vocoder_type not in ("hifigan", "waveglow"):
            raise ValueError(f"Unsupported vocoder type {vocoder_type}")

    text = clean_text(text, inflect_engine)
    sequence = text_to_sequence(text)
    _, mel_outputs_postnet, _, alignments = model.inference(sequence)

    if graph:
        generate_graph(alignments, graph)

    if audio:
        if vocoder_type == "hifigan":
            generate_audio_hifigan(vocoder, mel_outputs_postnet, audio)
        else:
            generate_audio_waveglow(vocoder, mel_outputs_postnet, audio)
def synthesize(model, waveglow_model, text, inflect_engine, graph=None, audio=None):
    """
    Runs inference on the given text and optionally saves the results.

    Parameters
    ----------
    model : Tacotron2
        Tacotron2 model
    waveglow_model : Torch
        Waveglow model
    text : str
        Text to synthesize
    inflect_engine : Inflect
        Inflect.engine() object
    graph : str (optional)
        Path to save alignment graph to
    audio : str (optional)
        Path to save audio file to
    """
    cleaned_text = clean_text(text, inflect_engine)
    inference_output = model.inference(text_to_sequence(cleaned_text))
    # Only the postnet mel output and the alignments are needed
    _, postnet_mel, _, alignment_matrix = inference_output

    if graph:
        generate_graph(alignment_matrix, graph)

    if audio:
        generate_audio(postnet_mel, waveglow_model, audio)
def synthesize(model, waveglow_model, text, inflect_engine, graph=None, audio=None):
    """
    Cleans the text, runs model inference and writes the requested outputs.

    Parameters
    ----------
    model : object
        Model exposing an ``inference(sequence)`` method returning four values
    waveglow_model : object
        Passed through to ``generate_audio`` when ``audio`` is given
    text : str
        Text to synthesize
    inflect_engine : object
        Passed through to ``clean_text``
    graph : str (optional)
        When truthy, passed to ``generate_graph`` together with the alignments
    audio : str (optional)
        When truthy, passed to ``generate_audio`` together with the mel output
    """
    normalised = clean_text(text, inflect_engine)
    symbol_sequence = text_to_sequence(normalised)
    # The first and third inference outputs are not used here
    _, mel_postnet, _, attention = model.inference(symbol_sequence)

    if graph:
        generate_graph(attention, graph)
    if audio:
        generate_audio(mel_postnet, waveglow_model, audio)
def get_text(self, text):
    """
    Converts transcription text into a tensor of symbol ids

    Parameters
    ----------
    text : str
        Transcription text

    Returns
    -------
    Tensor
        Int tensor of symbol ids
    """
    cleaned = clean_text(text, self.symbols)
    symbol_ids = []
    for symbol in cleaned:
        # Underscores are skipped rather than mapped to an id
        if symbol == "_":
            continue
        symbol_ids.append(self.symbol_to_id[symbol])
    return torch.IntTensor(symbol_ids)
def test_clean_text_with_custom_symbols():
    """Checks clean_text output when given a custom symbol list."""
    custom_symbols = ["c", "ó", "m", "o", "e", "s", "t", "á", "s", "¿", "?", " "]
    raw_text = "¿cómo estás?~\n"
    expected = "¿cómo estás?"

    cleaned = clean_text(raw_text, custom_symbols)

    assert cleaned == expected
def test_clean_text():
    """Checks clean_text output for a sample containing numbers, currency and an abbreviation."""
    raw_text = "1st $500 Mr. 10.5 2,000 30 a\tb ~"
    expected = "first five hundred dollars mister ten point five two thousand thirty a b "

    cleaned = clean_text(raw_text)

    assert cleaned == expected
def get_text(self, text):
    """
    Converts text into a tensor of symbol ids

    Parameters
    ----------
    text : str
        Text to convert (cleaned with this instance's inflect engine)

    Returns
    -------
    Tensor
        Int tensor of symbol ids
    """
    cleaned = clean_text(text, self.inflect_engine)
    # Underscores are dropped before the id lookup
    kept_symbols = (symbol for symbol in cleaned if symbol != "_")
    symbol_ids = [self.symbol_to_id[symbol] for symbol in kept_symbols]
    return torch.IntTensor(symbol_ids)
def train(
    audio_directory,
    output_directory,
    metadata_path=None,
    trainlist_path=None,
    vallist_path=None,
    symbols=DEFAULT_ALPHABET,
    checkpoint_path=None,
    transfer_learning_path=None,
    epochs=8000,
    batch_size=None,
    early_stopping=True,
    multi_gpu=True,
    iters_per_checkpoint=1000,
    iters_per_backup_checkpoint=10000,
    train_size=0.8,
    alignment_sentence="",
    logging=logging,
):
    """
    Trains the Tacotron2 model.

    Parameters
    ----------
    audio_directory : str
        Path to dataset clips
    output_directory : str
        Path to save checkpoints to
    metadata_path : str (optional)
        Path to label file
    trainlist_path : str (optional)
        Path to trainlist file
    vallist_path : str (optional)
        Path to vallist file
    symbols : list (optional)
        Valid symbols (default is English)
    checkpoint_path : str (optional)
        Path to a checkpoint to load (default is None)
    transfer_learning_path : str (optional)
        Path to a transfer learning checkpoint to use (default is None)
    epochs : int (optional)
        Number of epochs to run training for (default is 8000)
    batch_size : int (optional)
        Training batch size (calculated automatically if None)
    early_stopping : bool (optional)
        Whether to stop training when loss stops significantly decreasing (default is True)
    multi_gpu : bool (optional)
        Use multiple GPU's in parallel if available (default is True)
    iters_per_checkpoint : int (optional)
        How often temporary checkpoints are saved (number of iterations)
    iters_per_backup_checkpoint : int (optional)
        How often backup checkpoints are saved (number of iterations)
    train_size : float (optional)
        Percentage of samples to use for training (default is 80%/0.8)
    alignment_sentence : str (optional)
        Sentence for alignment graph to analyse performance
    logging : logging (optional)
        Logging object to write logs to

    Raises
    -------
    AssertionError
        If CUDA is not available or there is not enough GPU memory
    RuntimeError
        If the batch size is too high (causing CUDA out of memory)
    """
    assert metadata_path or (
        trainlist_path and vallist_path
    ), "You must give the path to your metadata file or trainlist/vallist files"
    assert torch.cuda.is_available(
    ), "You do not have Torch with CUDA installed. Please check CUDA & Pytorch install"
    os.makedirs(output_directory, exist_ok=True)

    available_memory_gb = get_available_memory()
    assert (
        available_memory_gb >= MINIMUM_MEMORY_GB
    ), f"Required GPU with at least {MINIMUM_MEMORY_GB}GB memory. (only {available_memory_gb}GB available)"

    if not batch_size:
        batch_size = get_batch_size(available_memory_gb)

    learning_rate = get_learning_rate(batch_size)
    logging.info(
        f"Setting batch size to {batch_size}, learning rate to {learning_rate}. ({available_memory_gb}GB GPU memory free)"
    )

    # Set seed
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    random.seed(SEED)

    # Setup GPU
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False

    # Load model & optimizer
    logging.info("Loading model...")
    model = Tacotron2().cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=WEIGHT_DECAY)
    criterion = Tacotron2Loss()
    logging.info("Loaded model")

    # Load data
    logging.info("Loading data...")
    if metadata_path:
        # metadata.csv
        filepaths_and_text = load_labels_file(metadata_path)
        random.shuffle(filepaths_and_text)
        train_files, test_files = train_test_split(filepaths_and_text, train_size)
    else:
        # trainlist.txt & vallist.txt
        train_files = load_labels_file(trainlist_path)
        test_files = load_labels_file(vallist_path)
        filepaths_and_text = train_files + test_files

    validate_dataset(filepaths_and_text, audio_directory, symbols)
    trainset = VoiceDataset(train_files, audio_directory, symbols)
    valset = VoiceDataset(test_files, audio_directory, symbols)
    collate_fn = TextMelCollate()

    # Data loaders
    train_loader = DataLoader(trainset, num_workers=0, sampler=None, batch_size=batch_size, pin_memory=False, collate_fn=collate_fn)
    val_loader = DataLoader(valset, num_workers=0, sampler=None, batch_size=batch_size, pin_memory=False, collate_fn=collate_fn)
    logging.info("Loaded data")

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0

    if checkpoint_path:
        if transfer_learning_path:
            logging.info(
                "Ignoring transfer learning as checkpoint already exists")
        model, optimizer, iteration, epoch_offset = load_checkpoint(
            checkpoint_path, model, optimizer, train_loader)
        # Resume from the iteration after the one stored in the checkpoint
        iteration += 1
        logging.info("Loaded checkpoint '{}' from iteration {}".format(
            checkpoint_path, iteration))
    elif transfer_learning_path:
        model = warm_start_model(transfer_learning_path, model, symbols)
        logging.info("Loaded transfer learning model '{}'".format(
            transfer_learning_path))
    else:
        logging.info("Generating first checkpoint...")

    # Enable Multi GPU
    if multi_gpu and torch.cuda.device_count() > 1:
        logging.info(f"Using {torch.cuda.device_count()} GPUs")
        model = nn.DataParallel(model)

    # Alignment sentence
    alignment_sequence = None
    alignment_folder = None
    if alignment_sentence:
        alignment_sequence = text_to_sequence(
            clean_text(alignment_sentence.strip(), symbols), symbols)
        alignment_folder = os.path.join(TRAINING_PATH, Path(output_directory).stem)
        os.makedirs(alignment_folder, exist_ok=True)

    model.train()
    validation_losses = []
    for epoch in range(epoch_offset, epochs):
        logging.info(f"Progress - {epoch}/{epochs}")
        for batch in train_loader:
            start = time.perf_counter()
            for param_group in optimizer.param_groups:
                param_group["lr"] = learning_rate

            # Backpropogation
            model.zero_grad()
            y, y_pred = process_batch(batch, model)

            loss = criterion(y_pred, y)
            avgmax_attention = calc_avgmax_attention(batch[-1], batch[1], y_pred[-1])
            reduced_loss = loss.item()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP_THRESH)
            optimizer.step()

            duration = time.perf_counter() - start
            logging.info(
                "Status - [Epoch {}: Iteration {}] Train loss {:.5f} Attention score {:.5f} {:.2f}s/it"
                .format(epoch, iteration, reduced_loss, avgmax_attention, duration))

            # Validate & save checkpoint
            if iteration % iters_per_checkpoint == 0:
                logging.info("Validating model")
                val_loss, avgmax_attention = validate(model, val_loader, criterion, iteration)
                validation_losses.append(val_loss)
                logging.info(
                    "Saving model and optimizer state at iteration {} to {}. Validation score = {:.5f}, Attention score = {:.5f}"
                    .format(iteration, output_directory, val_loss, avgmax_attention))
                checkpoint_path = save_checkpoint(
                    model,
                    optimizer,
                    learning_rate,
                    iteration,
                    symbols,
                    epoch,
                    output_directory,
                    iters_per_checkpoint,
                    iters_per_backup_checkpoint,
                )
                if alignment_sequence is not None:
                    # Best-effort: a failure to render the alignment sample
                    # is logged and must never interrupt training.
                    try:
                        _, _, _, alignment = load_model(
                            checkpoint_path).inference(alignment_sequence)
                        graph_path = os.path.join(
                            alignment_folder, "checkpoint_{}.png".format(iteration))
                        generate_graph(alignment, graph_path, heading=f"Iteration {iteration}")
                        graph = os.path.relpath(graph_path).replace("\\", "/")
                        logging.info(f"Alignment - {iteration}, {graph}")
                    except Exception:
                        logging.info(
                            "Failed to generate alignment sample, you may need to train for longer before this is possible"
                        )

            iteration += 1

        # Early Stopping
        if early_stopping and check_early_stopping(validation_losses):
            logging.info(
                "Stopping training early as loss is no longer decreasing")
            break

    logging.info(f"Progress - {epochs}/{epochs}")
    validate(model, val_loader, criterion, iteration)
    # Keep the returned path so the log below reports the checkpoint that was
    # just written rather than a stale (or None) path from earlier.
    checkpoint_path = save_checkpoint(
        model,
        optimizer,
        learning_rate,
        iteration,
        symbols,
        epochs,
        output_directory,
        iters_per_checkpoint,
        iters_per_backup_checkpoint,
    )
    logging.info(
        "Saving model and optimizer state at iteration {} to {}".format(
            iteration, checkpoint_path))
def synthesize(
    model,
    text,
    symbols=DEFAULT_ALPHABET,
    graph_path=None,
    audio_path=None,
    vocoder=None,
    silence_padding=0.15,
    sample_rate=22050,
    max_decoder_steps=1000,
    split_text=False,
):
    """
    Synthesise text for a given model.
    Produces graph and/or audio file when given.
    Supports multi line synthesis (given as a list of lines, or as a block
    of text with split_text enabled).

    Parameters
    ----------
    model : Tacotron2
        Tacotron2 model
    text : str/list
        Text to synthesize (or list of lines to synthesize)
    symbols : list
        List of symbols (default is English)
    graph_path : str (optional)
        Path to save alignment graph to
    audio_path : str (optional)
        Path to save audio file to
    vocoder : Object (optional)
        Vocoder model (required if generating audio)
    silence_padding : float (optional)
        Seconds of silence to separate each clip by with multi-line synthesis (default is 0.15)
    sample_rate : int (optional)
        Audio sample rate (default is 22050)
    max_decoder_steps : int (optional)
        Max decoder steps controls sequence length and memory usage during inference.
        Increasing this will use more memory but may allow for longer sentences. (default is 1000)
    split_text : bool (optional)
        Whether to use the split text tool to convert a block of text into multiple shorter sentences
        to synthesize (default is False)

    Raises
    -------
    AssertionError
        If audio_path is given without a vocoder
    """
    # Raised explicitly (instead of `assert`) so the check survives `python -O`
    if audio_path and not vocoder:
        raise AssertionError("Missing vocoder")

    if not isinstance(text, list) and split_text:
        # Split text into multiple lines
        text = nltk.tokenize.sent_tokenize(text)

    if isinstance(text, list):
        # Multi-lines given: drop blank lines and surrounding whitespace
        stripped_lines = (line.strip() for line in text)
        lines = [line for line in stripped_lines if line]

        mels = []
        alignments = []
        for line in lines:
            cleaned_line = clean_text(line, symbols)
            sequence = text_to_sequence(cleaned_line, symbols)
            _, mel_outputs_postnet, _, alignment = model.inference(
                sequence, max_decoder_steps)
            mels.append(mel_outputs_postnet)
            alignments.append(alignment)

        if graph_path:
            generate_graph(join_alignment_graphs(alignments), graph_path)

        if audio_path:
            silence = np.zeros(int(silence_padding * sample_rate)).astype("int16")
            audio_segments = []
            for index, mel in enumerate(mels):
                # Silence goes between clips only, never before the first
                if index:
                    audio_segments.append(silence)
                audio_segments.append(vocoder.generate_audio(mel))

            audio = np.concatenate(audio_segments)
            write(audio_path, sample_rate, audio)
    else:
        # Single sentence
        cleaned_text = clean_text(text.strip(), symbols)
        sequence = text_to_sequence(cleaned_text, symbols)
        _, mel_outputs_postnet, _, alignment = model.inference(
            sequence, max_decoder_steps)

        if graph_path:
            generate_graph(alignment, graph_path)

        if audio_path:
            audio = vocoder.generate_audio(mel_outputs_postnet)
            write(audio_path, sample_rate, audio)