示例#1
0
def validate_dataset(filepaths_and_text, dataset_directory, symbols):
    """
    Validates dataset has required files and a valid character set

    Parameters
    ----------
    filepaths_and_text : list
        List of samples, each unpacked as a (filename, text) pair
    dataset_directory : str
        Path to dataset audio directory
    symbols : list
        List of supported symbols

    Raises
    -------
    AssertionError
        If files are missing or invalid characters are found
    """
    missing_files = set()
    invalid_characters = set()
    # A set makes each membership test constant-time instead of
    # rescanning the whole directory listing for every sample.
    wavs = set(os.listdir(dataset_directory))
    for filename, text in filepaths_and_text:
        text = clean_text(text, remove_invalid_characters=False)
        if filename not in wavs:
            missing_files.add(filename)
        invalid_characters_for_row = get_invalid_characters(text, symbols)
        if invalid_characters_for_row:
            invalid_characters.update(invalid_characters_for_row)

    # Raised explicitly (rather than via `assert`) so the checks still run
    # when Python is started with -O; the exception type is unchanged.
    if missing_files:
        # Sorted so the message is reproducible between runs.
        raise AssertionError(f"Missing files: {','.join(sorted(missing_files))}")

    if invalid_characters:
        # unicodedata.name() raises ValueError for characters without a name
        # (e.g. tab or newline) unless a default is supplied, which would
        # hide the real validation error.
        described = [
            f"{c} ({unicodedata.name(c, 'UNKNOWN')})"
            for c in sorted(invalid_characters)
        ]
        raise AssertionError(
            f"Invalid characters in text (for alphabet): {','.join(described)}"
        )
示例#2
0
def synthesize(model, text, inflect_engine, graph=None, audio=None, vocoder=None, vocoder_type=None):
    """
    Runs inference on a piece of text with the given model.
    Writes the alignment graph and/or the audio file when a path is given.

    Parameters
    ----------
    model : Tacotron2
        Tacotron2 model
    text : str
        Text to synthesize
    inflect_engine : Inflect
        Inflect.engine() object
    graph : str (optional)
        Path to save alignment graph to
    audio : str (optional)
        Path to save audio file to
    vocoder : object (optional)
        Vocoder model (must be given when audio is requested)
    vocoder_type : str (optional)
        Which vocoder is being used: "hifigan" or "waveglow"

    Raises
    -------
    AssertionError
        If audio is requested without a vocoder
    Exception
        If audio is requested with an unsupported vocoder type
    """
    cleaned = clean_text(text, inflect_engine)
    inference_output = model.inference(text_to_sequence(cleaned))
    _, mel_postnet, _, attention = inference_output

    if graph:
        generate_graph(attention, graph)

    # Nothing more to do unless an audio path was requested
    if not audio:
        return

    assert vocoder, "Missing vocoder"
    if vocoder_type == "hifigan":
        generate_audio_hifigan(vocoder, mel_postnet, audio)
        return
    if vocoder_type == "waveglow":
        generate_audio_waveglow(vocoder, mel_postnet, audio)
        return
    raise Exception(f"Unsupported vocoder type {vocoder_type}")
def synthesize(model, waveglow_model, text, inflect_engine, graph=None, audio=None):
    """
    Runs inference on a piece of text with the given model.
    Writes the alignment graph and/or the audio file when a path is given.

    Parameters
    ----------
    model : Tacotron2
        Tacotron2 model
    waveglow_model : Torch
        Waveglow model
    text : str
        Text to synthesize
    inflect_engine : Inflect
        Inflect.engine() object
    graph : str (optional)
        Path to save alignment graph to
    audio : str (optional)
        Path to save audio file to
    """
    cleaned = clean_text(text, inflect_engine)
    inference_output = model.inference(text_to_sequence(cleaned))
    # Only the post-net mel output and the alignments are used below
    _, mel_postnet, _, attention = inference_output

    if graph:
        generate_graph(attention, graph)

    if not audio:
        return
    generate_audio(mel_postnet, waveglow_model, audio)
示例#4
0
def synthesize(model, waveglow_model, text, inflect_engine, graph=None, audio=None):
    """
    Synthesises text with the given model, optionally saving an
    alignment graph and/or an audio file.

    Parameters
    ----------
    model : object
        Model exposing an ``inference(sequence)`` method that returns four values
    waveglow_model : object
        Model passed through to ``generate_audio``
    text : str
        Text to synthesize
    inflect_engine : object
        Second argument passed to ``clean_text``
    graph : str (optional)
        When truthy, passed to ``generate_graph`` together with the alignments
    audio : str (optional)
        When truthy, passed to ``generate_audio`` together with the mel output
    """
    normalised = clean_text(text, inflect_engine)
    symbol_sequence = text_to_sequence(normalised)
    result = model.inference(symbol_sequence)
    _, postnet_mel, _, alignment_matrix = result

    if graph:
        generate_graph(alignment_matrix, graph)
    if audio:
        generate_audio(postnet_mel, waveglow_model, audio)
    def get_text(self, text):
        """
        Converts a transcription into a tensor of symbol ids

        The text is cleaned against ``self.symbols`` and every remaining
        character except "_" is looked up in ``self.symbol_to_id``.

        Parameters
        ----------
        text : str
            Transcription text

        Returns
        -------
        Tensor
            Int tensor of symbol ids
        """
        cleaned = clean_text(text, self.symbols)
        symbol_ids = []
        for symbol in cleaned:
            # "_" is skipped and never mapped to an id
            if symbol == "_":
                continue
            symbol_ids.append(self.symbol_to_id[symbol])
        return torch.IntTensor(symbol_ids)
示例#6
0
def test_clean_text_with_custom_symbols():
    """
    Checks clean_text with a custom symbol list: the "~" and the trailing
    newline in the input are expected to be absent from the result.
    """
    custom_symbols = ["c", "ó", "m", "o", "e", "s", "t", "á", "s", "¿", "?", " "]
    raw = "¿cómo estás?~\n"
    expected = "¿cómo estás?"
    cleaned = clean_text(raw, custom_symbols)
    assert cleaned == expected
示例#7
0
def test_clean_text():
    """
    Checks clean_text with its default arguments on input containing an
    ordinal, a currency amount, an abbreviation, decimal and comma-grouped
    numbers, a tab and a "~".
    """
    raw = "1st $500 Mr. 10.5 2,000 30 a\tb ~"
    expected = "first five hundred dollars mister ten point five two thousand thirty a b "
    assert clean_text(raw) == expected
示例#8
0
 def get_text(self, text):
     """
     Converts text into an int tensor of symbol ids

     The text is cleaned with ``self.inflect_engine`` and every resulting
     character except "_" is looked up in ``self.symbol_to_id``.
     """
     cleaned = clean_text(text, self.inflect_engine)
     symbol_ids = []
     for symbol in cleaned:
         # "_" is skipped and never mapped to an id
         if symbol == "_":
             continue
         symbol_ids.append(self.symbol_to_id[symbol])
     return torch.IntTensor(symbol_ids)
示例#9
0
def train(
    audio_directory,
    output_directory,
    metadata_path=None,
    trainlist_path=None,
    vallist_path=None,
    symbols=DEFAULT_ALPHABET,
    checkpoint_path=None,
    transfer_learning_path=None,
    epochs=8000,
    batch_size=None,
    early_stopping=True,
    multi_gpu=True,
    iters_per_checkpoint=1000,
    iters_per_backup_checkpoint=10000,
    train_size=0.8,
    alignment_sentence="",
    logging=logging,
):
    """
    Trains the Tacotron2 model.

    Parameters
    ----------
    audio_directory : str
        Path to dataset clips
    output_directory : str
        Path to save checkpoints to
    metadata_path : str (optional)
        Path to label file
    trainlist_path : str (optional)
        Path to trainlist file
    vallist_path : str (optional)
        Path to vallist file
    symbols : list (optional)
        Valid symbols (default is English)
    checkpoint_path : str (optional)
        Path to a checkpoint to load (default is None)
    transfer_learning_path : str (optional)
        Path to a transfer learning checkpoint to use (default is None)
    epochs : int (optional)
        Number of epochs to run training for (default is 8000)
    batch_size : int (optional)
        Training batch size (calculated automatically if None)
    early_stopping : bool (optional)
        Whether to stop training when loss stops significantly decreasing (default is True)
    multi_gpu : bool (optional)
        Use multiple GPU's in parallel if available (default is True)
    iters_per_checkpoint : int (optional)
        How often temporary checkpoints are saved (number of iterations)
    iters_per_backup_checkpoint : int (optional)
        How often backup checkpoints are saved (number of iterations)
    train_size : float (optional)
        Percentage of samples to use for training (default is 80%/0.8)
    alignment_sentence : str (optional)
        Sentence for alignment graph to analyse performance
    logging : logging (optional)
        Logging object to write logs to

    Raises
    -------
    AssertionError
        If neither a metadata file nor trainlist/vallist files are given,
        if CUDA is not available or there is not enough GPU memory
    RuntimeError
        If the batch size is too high (causing CUDA out of memory)
    """
    assert metadata_path or (
        trainlist_path and vallist_path
    ), "You must give the path to your metadata file or trainlist/vallist files"
    assert torch.cuda.is_available(
    ), "You do not have Torch with CUDA installed. Please check CUDA & Pytorch install"
    os.makedirs(output_directory, exist_ok=True)

    available_memory_gb = get_available_memory()
    assert (
        available_memory_gb >= MINIMUM_MEMORY_GB
    ), f"Required GPU with at least {MINIMUM_MEMORY_GB}GB memory. (only {available_memory_gb}GB available)"

    # Batch size is derived from free GPU memory unless given explicitly
    if not batch_size:
        batch_size = get_batch_size(available_memory_gb)

    learning_rate = get_learning_rate(batch_size)
    logging.info(
        f"Setting batch size to {batch_size}, learning rate to {learning_rate}. ({available_memory_gb}GB GPU memory free)"
    )

    # Set seed
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    random.seed(SEED)

    # Setup GPU
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False

    # Load model & optimizer
    logging.info("Loading model...")
    model = Tacotron2().cuda()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=WEIGHT_DECAY)
    criterion = Tacotron2Loss()
    logging.info("Loaded model")

    # Load data
    logging.info("Loading data...")
    if metadata_path:
        # metadata.csv
        filepaths_and_text = load_labels_file(metadata_path)
        random.shuffle(filepaths_and_text)
        train_files, test_files = train_test_split(filepaths_and_text,
                                                   train_size)
    else:
        # trainlist.txt & vallist.txt
        train_files = load_labels_file(trainlist_path)
        test_files = load_labels_file(vallist_path)
        filepaths_and_text = train_files + test_files

    validate_dataset(filepaths_and_text, audio_directory, symbols)
    trainset = VoiceDataset(train_files, audio_directory, symbols)
    valset = VoiceDataset(test_files, audio_directory, symbols)
    collate_fn = TextMelCollate()

    # Data loaders
    train_loader = DataLoader(trainset,
                              num_workers=0,
                              sampler=None,
                              batch_size=batch_size,
                              pin_memory=False,
                              collate_fn=collate_fn)
    val_loader = DataLoader(valset,
                            num_workers=0,
                            sampler=None,
                            batch_size=batch_size,
                            pin_memory=False,
                            collate_fn=collate_fn)
    logging.info("Loaded data")

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0

    if checkpoint_path:
        # A checkpoint takes priority over transfer learning
        if transfer_learning_path:
            logging.info(
                "Ignoring transfer learning as checkpoint already exists")
        model, optimizer, iteration, epoch_offset = load_checkpoint(
            checkpoint_path, model, optimizer, train_loader)
        # Resume on the iteration after the one stored in the checkpoint
        iteration += 1
        logging.info("Loaded checkpoint '{}' from iteration {}".format(
            checkpoint_path, iteration))
    elif transfer_learning_path:
        model = warm_start_model(transfer_learning_path, model, symbols)
        logging.info("Loaded transfer learning model '{}'".format(
            transfer_learning_path))
    else:
        logging.info("Generating first checkpoint...")

    # Enable Multi GPU
    if multi_gpu and torch.cuda.device_count() > 1:
        logging.info(f"Using {torch.cuda.device_count()} GPUs")
        model = nn.DataParallel(model)

    # Alignment sentence
    alignment_sequence = None
    alignment_folder = None
    if alignment_sentence:
        alignment_sequence = text_to_sequence(
            clean_text(alignment_sentence.strip(), symbols), symbols)
        alignment_folder = os.path.join(TRAINING_PATH,
                                        Path(output_directory).stem)
        os.makedirs(alignment_folder, exist_ok=True)

    model.train()
    validation_losses = []
    for epoch in range(epoch_offset, epochs):
        logging.info(f"Progress - {epoch}/{epochs}")
        for batch in train_loader:
            start = time.perf_counter()
            # Re-apply the learning rate computed above on every iteration
            for param_group in optimizer.param_groups:
                param_group["lr"] = learning_rate

            # Backpropogation
            model.zero_grad()
            y, y_pred = process_batch(batch, model)

            loss = criterion(y_pred, y)
            avgmax_attention = calc_avgmax_attention(batch[-1], batch[1],
                                                     y_pred[-1])
            reduced_loss = loss.item()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           GRAD_CLIP_THRESH)
            optimizer.step()

            duration = time.perf_counter() - start
            logging.info(
                "Status - [Epoch {}: Iteration {}] Train loss {:.5f} Attention score {:.5f} {:.2f}s/it"
                .format(epoch, iteration, reduced_loss, avgmax_attention,
                        duration))

            # Validate & save checkpoint
            if iteration % iters_per_checkpoint == 0:
                logging.info("Validating model")
                val_loss, avgmax_attention = validate(model, val_loader,
                                                      criterion, iteration)
                validation_losses.append(val_loss)
                logging.info(
                    "Saving model and optimizer state at iteration {} to {}. Validation score = {:.5f}, Attention score = {:.5f}"
                    .format(iteration, output_directory, val_loss,
                            avgmax_attention))
                checkpoint_path = save_checkpoint(
                    model,
                    optimizer,
                    learning_rate,
                    iteration,
                    symbols,
                    epoch,
                    output_directory,
                    iters_per_checkpoint,
                    iters_per_backup_checkpoint,
                )
                if alignment_sequence is not None:
                    # Best effort: a failure here is logged and training continues
                    try:
                        _, _, _, alignment = load_model(
                            checkpoint_path).inference(alignment_sequence)
                        graph_path = os.path.join(
                            alignment_folder,
                            "checkpoint_{}.png".format(iteration))
                        generate_graph(alignment,
                                       graph_path,
                                       heading=f"Iteration {iteration}")
                        graph = os.path.relpath(graph_path).replace("\\", "/")
                        logging.info(f"Alignment - {iteration}, {graph}")
                    except Exception:
                        logging.info(
                            "Failed to generate alignment sample, you may need to train for longer before this is possible"
                        )

            iteration += 1

        # Early Stopping
        if early_stopping and check_early_stopping(validation_losses):
            logging.info(
                "Stopping training early as loss is no longer decreasing")
            break

    logging.info(f"Progress - {epochs}/{epochs}")
    validate(model, val_loader, criterion, iteration)
    # Keep the returned path so the log below names the checkpoint that was
    # just written, not an earlier one (or the path passed in by the caller).
    checkpoint_path = save_checkpoint(
        model,
        optimizer,
        learning_rate,
        iteration,
        symbols,
        epochs,
        output_directory,
        iters_per_checkpoint,
        iters_per_backup_checkpoint,
    )
    logging.info(
        "Saving model and optimizer state at iteration {} to {}".format(
            iteration, checkpoint_path))
示例#10
0
def synthesize(
    model,
    text,
    symbols=DEFAULT_ALPHABET,
    graph_path=None,
    audio_path=None,
    vocoder=None,
    silence_padding=0.15,
    sample_rate=22050,
    max_decoder_steps=1000,
    split_text=False,
):
    """
    Synthesise text for a given model.
    Produces graph and/or audio file when given.
    Supports multi line synthesis (pass a list of lines, or enable split_text).

    Parameters
    ----------
    model : Tacotron2
        Tacotron2 model
    text : str/list
        Text to synthesize (or list of lines to synthesize)
    symbols : list
        List of symbols (default is English)
    graph_path : str (optional)
        Path to save alignment graph to
    audio_path : str (optional)
        Path to save audio file to
    vocoder : Object (optional)
        Vocoder model (required if generating audio)
    silence_padding : float (optional)
        Seconds of silence to separate each clip by with multi-line synthesis (default is 0.15)
    sample_rate : int (optional)
        Audio sample rate (default is 22050)
    max_decoder_steps : int (optional)
        Max decoder steps controls sequence length and memory usage during inference.
        Increasing this will use more memory but may allow for longer sentences. (default is 1000)
    split_text : bool (optional)
        Whether to use the split text tool to convert a block of text into multiple shorter sentences
        to synthesize (default is False)

    Raises
    -------
    AssertionError
        If audio_path is given without a vocoder
    """
    if audio_path:
        assert vocoder, "Missing vocoder"

    if not isinstance(text, list) and split_text:
        # Split text into multiple lines
        text = nltk.tokenize.sent_tokenize(text)

    if isinstance(text, list):
        # Multi-lines given (blank lines are dropped)
        lines = [line.strip() for line in text if line.strip()]
        mels = []
        alignments = []
        for line in lines:
            sequence = text_to_sequence(clean_text(line, symbols), symbols)
            _, mel_outputs_postnet, _, alignment = model.inference(
                sequence, max_decoder_steps)
            mels.append(mel_outputs_postnet)
            alignments.append(alignment)

        if graph_path:
            generate_graph(join_alignment_graphs(alignments), graph_path)

        if audio_path:
            silence = np.zeros(int(silence_padding *
                                   sample_rate)).astype("int16")
            audio_segments = []
            last_index = len(mels) - 1
            for index, mel in enumerate(mels):
                audio_segments.append(vocoder.generate_audio(mel))
                # Silence goes between clips only, not after the final one
                if index != last_index:
                    audio_segments.append(silence)

            audio = np.concatenate(audio_segments)
            write(audio_path, sample_rate, audio)
    else:
        # Single sentence
        cleaned = clean_text(text.strip(), symbols)
        sequence = text_to_sequence(cleaned, symbols)
        _, mel_outputs_postnet, _, alignment = model.inference(
            sequence, max_decoder_steps)

        if graph_path:
            generate_graph(alignment, graph_path)

        if audio_path:
            audio = vocoder.generate_audio(mel_outputs_postnet)
            write(audio_path, sample_rate, audio)