def test_load_and_save_checkpoint():
    model_path = os.path.join("test_samples", "model.pt")
    model = Tacotron2()
    lr = 0.1
    symbols = list("ABC")
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=WEIGHT_DECAY)

    model, optimizer, iteration, epoch = load_checkpoint(model_path, model, optimizer, [None] * 10000)
    assert iteration == 510000
    assert epoch == iteration // 10000

    checkpoint_folder = "test-checkpoints"
    os.makedirs(checkpoint_folder)
    save_checkpoint(model, optimizer, lr, iteration, symbols, epoch, checkpoint_folder, 1000, 1000)
    assert "checkpoint_510000" in os.listdir(checkpoint_folder)
    shutil.rmtree(checkpoint_folder)
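
# Usage note (sketch, not from the source): the test above relies on a fixture
# checkpoint at test_samples/model.pt and the project's WEIGHT_DECAY constant.
# Assuming the project uses pytest, it can be run in isolation as shown below;
# the "-k" selection flag is standard pytest, but running it this way (rather
# than via the project's own test runner) is an assumption.
if __name__ == "__main__":
    import pytest

    pytest.main(["-k", "test_load_and_save_checkpoint"])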
def train(
    metadata_path,
    dataset_directory,
    output_directory,
    find_checkpoint=True,
    checkpoint_path=None,
    transfer_learning_path=None,
    epochs=8000,
    batch_size=None,
    logging=logging,
):
    assert torch.cuda.is_available(), "You do not have Torch with CUDA installed. Please check CUDA & Pytorch install"
    os.makedirs(output_directory, exist_ok=True)

    available_memory_gb = get_available_memory()
    assert (
        available_memory_gb >= MINIMUM_MEMORY_GB
    ), f"Required GPU with at least {MINIMUM_MEMORY_GB}GB memory. (only {available_memory_gb}GB available)"

    if not batch_size:
        batch_size = get_batch_size(available_memory_gb)

    learning_rate = get_learning_rate(batch_size)
    logging.info(
        f"Setting batch size to {batch_size}, learning rate to {learning_rate}. ({available_memory_gb}GB GPU memory free)"
    )

    # Hyperparams
    train_size = 0.8
    weight_decay = 1e-6
    grad_clip_thresh = 1.0
    iters_per_checkpoint = 1000
    seed = 1234
    symbols = "_-!'(),.:;? ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"

    # Set seed
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    random.seed(seed)

    # Setup GPU
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False

    # Load model & optimizer
    logging.info("Loading model...")
    model = Tacotron2().cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = Tacotron2Loss()
    logging.info("Loaded model")

    # Load data
    logging.info("Loading data...")
    with open(metadata_path, encoding="utf-8") as f:
        filepaths_and_text = [line.strip().split("|") for line in f]

    random.shuffle(filepaths_and_text)
    train_cutoff = int(len(filepaths_and_text) * train_size)
    train_files = filepaths_and_text[:train_cutoff]
    test_files = filepaths_and_text[train_cutoff:]
    print(f"{len(train_files)} train files, {len(test_files)} test files")

    trainset = VoiceDataset(train_files, dataset_directory, symbols, seed)
    valset = VoiceDataset(test_files, dataset_directory, symbols, seed)
    collate_fn = TextMelCollate()

    # Data loaders
    train_loader = DataLoader(
        trainset, num_workers=1, sampler=None, batch_size=batch_size, pin_memory=False, collate_fn=collate_fn
    )
    val_loader = DataLoader(
        valset, num_workers=1, sampler=None, batch_size=batch_size, pin_memory=False, collate_fn=collate_fn
    )
    logging.info("Loaded data")

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if find_checkpoint and not checkpoint_path and not transfer_learning_path:
        checkpoint_path = get_latest_checkpoint(output_directory)

    if checkpoint_path:
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer)
        iteration += 1
        epoch_offset = max(0, int(iteration / len(train_loader)))
        logging.info("Loaded checkpoint '{}' from iteration {}".format(checkpoint_path, iteration))
    elif transfer_learning_path:
        model = warm_start_model(transfer_learning_path, model)
        logging.info("Loaded transfer learning model '{}'".format(transfer_learning_path))

    num_iterations = len(train_loader) * epochs - epoch_offset
    check_space(num_iterations // iters_per_checkpoint)

    model.train()
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        logging.info(f"Progress - {epoch}/{epochs}")
        for _, batch in enumerate(train_loader):
            start = time.perf_counter()
            for param_group in optimizer.param_groups:
                param_group["lr"] = learning_rate

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            loss = criterion(y_pred, y)
            reduced_loss = loss.item()
            loss.backward()

            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_thresh)
            optimizer.step()

            duration = time.perf_counter() - start
            logging.info(
                "Status - [Epoch {}: Iteration {}] Train loss {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                    epoch, iteration, reduced_loss, grad_norm, duration
                )
            )

            if iteration % iters_per_checkpoint == 0:
                validate(model, val_loader, criterion, iteration)
                checkpoint_path = os.path.join(output_directory, "checkpoint_{}".format(iteration))
                save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path)
                logging.info(
                    "Saving model and optimizer state at iteration {} to {}".format(iteration, checkpoint_path)
                )

            iteration += 1

    validate(model, val_loader, criterion, iteration)
    checkpoint_path = os.path.join(output_directory, "checkpoint_{}".format(iteration))
    save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path)
    logging.info("Saving model and optimizer state at iteration {} to {}".format(iteration, checkpoint_path))
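
# Example (hypothetical): a minimal invocation of this version of train().
# The paths below are placeholders rather than values from the source;
# metadata_path is expected to contain pipe-separated "<clip>|<text>" lines,
# and output_directory is created automatically and scanned for the latest
# checkpoint when find_checkpoint is True.
if __name__ == "__main__":
    train(
        metadata_path="dataset/metadata.csv",
        dataset_directory="dataset/wavs",
        output_directory="checkpoints/my-voice",
        epochs=8000,        # default, shown for clarity
        batch_size=None,    # None -> derived from available GPU memory
    )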
def train(
    metadata_path,
    dataset_directory,
    output_directory,
    find_checkpoint=True,
    checkpoint_path=None,
    transfer_learning_path=None,
    overwrite_checkpoints=True,
    epochs=8000,
    batch_size=None,
    early_stopping=True,
    logging=logging,
):
    """
    Trains the Tacotron2 model.

    Parameters
    ----------
    metadata_path : str
        Path to label file
    dataset_directory : str
        Path to dataset clips
    output_directory : str
        Path to save checkpoints to
    find_checkpoint : bool (optional)
        Search for latest checkpoint to continue training from (default is True)
    checkpoint_path : str (optional)
        Path to a checkpoint to load (default is None)
    transfer_learning_path : str (optional)
        Path to a transfer learning checkpoint to use (default is None)
    overwrite_checkpoints : bool (optional)
        Whether to overwrite old checkpoints (default is True)
    epochs : int (optional)
        Number of epochs to run training for (default is 8000)
    batch_size : int (optional)
        Training batch size (calculated automatically if None)
    early_stopping : bool (optional)
        Whether to stop training when loss stops significantly decreasing (default is True)
    logging : logging (optional)
        Logging object to write logs to

    Raises
    ------
    AssertionError
        If CUDA is not available or there is not enough GPU memory
    RuntimeError
        If the batch size is too high (causing CUDA out of memory)
    """
    assert torch.cuda.is_available(), "You do not have Torch with CUDA installed. Please check CUDA & Pytorch install"
    os.makedirs(output_directory, exist_ok=True)

    available_memory_gb = get_available_memory()
    assert (
        available_memory_gb >= MINIMUM_MEMORY_GB
    ), f"Required GPU with at least {MINIMUM_MEMORY_GB}GB memory. (only {available_memory_gb}GB available)"

    if not batch_size:
        batch_size = get_batch_size(available_memory_gb)

    learning_rate = get_learning_rate(batch_size)
    logging.info(
        f"Setting batch size to {batch_size}, learning rate to {learning_rate}. ({available_memory_gb}GB GPU memory free)"
    )

    # Set seed
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    random.seed(SEED)

    # Setup GPU
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False

    # Load model & optimizer
    logging.info("Loading model...")
    model = Tacotron2().cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=WEIGHT_DECAY)
    criterion = Tacotron2Loss()
    logging.info("Loaded model")

    # Load data
    logging.info("Loading data...")
    with open(metadata_path, encoding="utf-8") as f:
        filepaths_and_text = [line.strip().split("|") for line in f]

    random.shuffle(filepaths_and_text)
    train_cutoff = int(len(filepaths_and_text) * TRAIN_SIZE)
    train_files = filepaths_and_text[:train_cutoff]
    test_files = filepaths_and_text[train_cutoff:]
    print(f"{len(train_files)} train files, {len(test_files)} test files")

    trainset = VoiceDataset(train_files, dataset_directory, SYMBOLS, SEED)
    valset = VoiceDataset(test_files, dataset_directory, SYMBOLS, SEED)
    collate_fn = TextMelCollate()

    # Data loaders
    train_loader = DataLoader(
        trainset, num_workers=0, sampler=None, batch_size=batch_size, pin_memory=False, collate_fn=collate_fn
    )
    val_loader = DataLoader(
        valset, num_workers=0, sampler=None, batch_size=batch_size, pin_memory=False, collate_fn=collate_fn
    )
    logging.info("Loaded data")

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if find_checkpoint and not checkpoint_path and not transfer_learning_path:
        checkpoint_path = get_latest_checkpoint(output_directory)

    if checkpoint_path:
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer)
        iteration += 1
        epoch_offset = max(0, int(iteration / len(train_loader)))
        logging.info("Loaded checkpoint '{}' from iteration {}".format(checkpoint_path, iteration))
    elif transfer_learning_path:
        model = warm_start_model(transfer_learning_path, model)
        logging.info("Loaded transfer learning model '{}'".format(transfer_learning_path))

    # Check available space if old checkpoints are kept
    if not overwrite_checkpoints:
        num_iterations = len(train_loader) * epochs - epoch_offset
        check_space(num_iterations // ITERS_PER_CHECKPOINT)

    model.train()
    validation_losses = []
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        logging.info(f"Progress - {epoch}/{epochs}")
        for _, batch in enumerate(train_loader):
            start = time.perf_counter()
            for param_group in optimizer.param_groups:
                param_group["lr"] = learning_rate

            # Backpropagation
            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)
            loss = criterion(y_pred, y)
            reduced_loss = loss.item()
            loss.backward()
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP_THRESH)
            optimizer.step()

            duration = time.perf_counter() - start
            logging.info(
                "Status - [Epoch {}: Iteration {}] Train loss {:.6f} {:.2f}s/it".format(
                    epoch, iteration, reduced_loss, duration
                )
            )

            # Validate & save checkpoint
            if iteration % ITERS_PER_CHECKPOINT == 0:
                val_loss = validate(model, val_loader, criterion, iteration)
                validation_losses.append(val_loss)
                logging.info(
                    "Saving model and optimizer state at iteration {} to {}. Scored {}".format(
                        iteration, output_directory, val_loss
                    )
                )
                save_checkpoint(model, optimizer, learning_rate, iteration, output_directory, overwrite_checkpoints)

            iteration += 1

        # Early Stopping
        if early_stopping and len(validation_losses) >= EARLY_STOPPING_WINDOW:
            losses = validation_losses[-EARLY_STOPPING_WINDOW:]
            difference = max(losses) - min(losses)
            if difference < EARLY_STOPPING_MIN_DIFFERENCE:
                logging.info("Stopping training early as loss is no longer decreasing")
                logging.info(f"Progress - {epoch}/{epoch}")
                break

    validate(model, val_loader, criterion, iteration)
    save_checkpoint(model, optimizer, learning_rate, iteration, output_directory, overwrite_checkpoints)
    logging.info("Saving model and optimizer state at iteration {} to {}".format(iteration, output_directory))
def train(
    audio_directory,
    output_directory,
    metadata_path=None,
    trainlist_path=None,
    vallist_path=None,
    symbols=DEFAULT_ALPHABET,
    checkpoint_path=None,
    transfer_learning_path=None,
    epochs=8000,
    batch_size=None,
    early_stopping=True,
    multi_gpu=True,
    iters_per_checkpoint=1000,
    iters_per_backup_checkpoint=10000,
    train_size=0.8,
    alignment_sentence="",
    logging=logging,
):
    """
    Trains the Tacotron2 model.

    Parameters
    ----------
    audio_directory : str
        Path to dataset clips
    output_directory : str
        Path to save checkpoints to
    metadata_path : str (optional)
        Path to label file
    trainlist_path : str (optional)
        Path to trainlist file
    vallist_path : str (optional)
        Path to vallist file
    symbols : list (optional)
        Valid symbols (default is English)
    checkpoint_path : str (optional)
        Path to a checkpoint to load (default is None)
    transfer_learning_path : str (optional)
        Path to a transfer learning checkpoint to use (default is None)
    epochs : int (optional)
        Number of epochs to run training for (default is 8000)
    batch_size : int (optional)
        Training batch size (calculated automatically if None)
    early_stopping : bool (optional)
        Whether to stop training when loss stops significantly decreasing (default is True)
    multi_gpu : bool (optional)
        Use multiple GPUs in parallel if available (default is True)
    iters_per_checkpoint : int (optional)
        How often temporary checkpoints are saved (number of iterations)
    iters_per_backup_checkpoint : int (optional)
        How often backup checkpoints are saved (number of iterations)
    train_size : float (optional)
        Percentage of samples to use for training (default is 80%/0.8)
    alignment_sentence : str (optional)
        Sentence for alignment graph to analyse performance
    logging : logging (optional)
        Logging object to write logs to

    Raises
    ------
    AssertionError
        If CUDA is not available or there is not enough GPU memory
    RuntimeError
        If the batch size is too high (causing CUDA out of memory)
    """
    assert metadata_path or (
        trainlist_path and vallist_path
    ), "You must give the path to your metadata file or trainlist/vallist files"
    assert torch.cuda.is_available(), "You do not have Torch with CUDA installed. Please check CUDA & Pytorch install"
    os.makedirs(output_directory, exist_ok=True)

    available_memory_gb = get_available_memory()
    assert (
        available_memory_gb >= MINIMUM_MEMORY_GB
    ), f"Required GPU with at least {MINIMUM_MEMORY_GB}GB memory. (only {available_memory_gb}GB available)"

    if not batch_size:
        batch_size = get_batch_size(available_memory_gb)

    learning_rate = get_learning_rate(batch_size)
    logging.info(
        f"Setting batch size to {batch_size}, learning rate to {learning_rate}. ({available_memory_gb}GB GPU memory free)"
    )

    # Set seed
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    random.seed(SEED)

    # Setup GPU
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False

    # Load model & optimizer
    logging.info("Loading model...")
    model = Tacotron2().cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=WEIGHT_DECAY)
    criterion = Tacotron2Loss()
    logging.info("Loaded model")

    # Load data
    logging.info("Loading data...")
    if metadata_path:
        # metadata.csv
        filepaths_and_text = load_labels_file(metadata_path)
        random.shuffle(filepaths_and_text)
        train_files, test_files = train_test_split(filepaths_and_text, train_size)
    else:
        # trainlist.txt & vallist.txt
        train_files = load_labels_file(trainlist_path)
        test_files = load_labels_file(vallist_path)
        filepaths_and_text = train_files + test_files
    validate_dataset(filepaths_and_text, audio_directory, symbols)

    trainset = VoiceDataset(train_files, audio_directory, symbols)
    valset = VoiceDataset(test_files, audio_directory, symbols)
    collate_fn = TextMelCollate()

    # Data loaders
    train_loader = DataLoader(
        trainset, num_workers=0, sampler=None, batch_size=batch_size, pin_memory=False, collate_fn=collate_fn
    )
    val_loader = DataLoader(
        valset, num_workers=0, sampler=None, batch_size=batch_size, pin_memory=False, collate_fn=collate_fn
    )
    logging.info("Loaded data")

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path:
        if transfer_learning_path:
            logging.info("Ignoring transfer learning as checkpoint already exists")
        model, optimizer, iteration, epoch_offset = load_checkpoint(checkpoint_path, model, optimizer, train_loader)
        iteration += 1
        logging.info("Loaded checkpoint '{}' from iteration {}".format(checkpoint_path, iteration))
    elif transfer_learning_path:
        model = warm_start_model(transfer_learning_path, model, symbols)
        logging.info("Loaded transfer learning model '{}'".format(transfer_learning_path))
    else:
        logging.info("Generating first checkpoint...")

    # Enable Multi GPU
    if multi_gpu and torch.cuda.device_count() > 1:
        logging.info(f"Using {torch.cuda.device_count()} GPUs")
        model = nn.DataParallel(model)

    # Alignment sentence
    alignment_sequence = None
    alignment_folder = None
    if alignment_sentence:
        alignment_sequence = text_to_sequence(clean_text(alignment_sentence.strip(), symbols), symbols)
        alignment_folder = os.path.join(TRAINING_PATH, Path(output_directory).stem)
        os.makedirs(alignment_folder, exist_ok=True)

    model.train()
    validation_losses = []
    for epoch in range(epoch_offset, epochs):
        logging.info(f"Progress - {epoch}/{epochs}")
        for _, batch in enumerate(train_loader):
            start = time.perf_counter()
            for param_group in optimizer.param_groups:
                param_group["lr"] = learning_rate

            # Backpropagation
            model.zero_grad()
            y, y_pred = process_batch(batch, model)

            loss = criterion(y_pred, y)
            avgmax_attention = calc_avgmax_attention(batch[-1], batch[1], y_pred[-1])
            reduced_loss = loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP_THRESH)
            optimizer.step()

            duration = time.perf_counter() - start
            logging.info(
                "Status - [Epoch {}: Iteration {}] Train loss {:.5f} Attention score {:.5f} {:.2f}s/it".format(
                    epoch, iteration, reduced_loss, avgmax_attention, duration
                )
            )

            # Validate & save checkpoint
            if iteration % iters_per_checkpoint == 0:
                logging.info("Validating model")
                val_loss, avgmax_attention = validate(model, val_loader, criterion, iteration)
                validation_losses.append(val_loss)
                logging.info(
                    "Saving model and optimizer state at iteration {} to {}. Validation score = {:.5f}, Attention score = {:.5f}".format(
                        iteration, output_directory, val_loss, avgmax_attention
                    )
                )
                checkpoint_path = save_checkpoint(
                    model,
                    optimizer,
                    learning_rate,
                    iteration,
                    symbols,
                    epoch,
                    output_directory,
                    iters_per_checkpoint,
                    iters_per_backup_checkpoint,
                )
                if alignment_sequence is not None:
                    try:
                        _, _, _, alignment = load_model(checkpoint_path).inference(alignment_sequence)
                        graph_path = os.path.join(alignment_folder, "checkpoint_{}.png".format(iteration))
                        generate_graph(alignment, graph_path, heading=f"Iteration {iteration}")
                        graph = os.path.relpath(graph_path).replace("\\", "/")
                        logging.info(f"Alignment - {iteration}, {graph}")
                    except Exception:
                        logging.info(
                            "Failed to generate alignment sample, you may need to train for longer before this is possible"
                        )

            iteration += 1

        # Early Stopping
        if early_stopping and check_early_stopping(validation_losses):
            logging.info("Stopping training early as loss is no longer decreasing")
            break

    logging.info(f"Progress - {epochs}/{epochs}")
    validate(model, val_loader, criterion, iteration)
    checkpoint_path = save_checkpoint(
        model,
        optimizer,
        learning_rate,
        iteration,
        symbols,
        epochs,
        output_directory,
        iters_per_checkpoint,
        iters_per_backup_checkpoint,
    )
    logging.info("Saving model and optimizer state at iteration {} to {}".format(iteration, checkpoint_path))
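
# Example (hypothetical): a minimal invocation of the current train() signature.
# Paths and the alignment sentence are placeholders, not values from the source;
# pass either metadata_path or trainlist_path/vallist_path for the labels.
if __name__ == "__main__":
    train(
        audio_directory="dataset/wavs",
        output_directory="checkpoints/my-voice",
        metadata_path="dataset/metadata.csv",       # or trainlist_path= / vallist_path=
        symbols=DEFAULT_ALPHABET,                   # default English symbol set
        iters_per_checkpoint=1000,
        iters_per_backup_checkpoint=10000,
        alignment_sentence="The quick brown fox jumps over the lazy dog.",
    )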