Example #1
def upload_dataset():
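    """Flask route: validates an uploaded dataset zip and saves its contents."""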
    dataset = request.files["dataset"]
    dataset_name = request.values["name"]
    dataset_directory = os.path.join(paths["datasets"], dataset_name)
    audio_folder = os.path.join(dataset_directory, AUDIO_FOLDER)
    assert not os.path.isdir(dataset_directory), "Output folder already exists"

    with zipfile.ZipFile(dataset, mode="r") as z:
        files_list = z.namelist()
        if "metadata.csv" not in files_list:
            return render_template(
                "import-export.html",
                message="Dataset missing metadata.csv. Make sure this file is in the root of the zip file",
            )

        folders = [x.split("/")[0] for x in files_list if "/" in x]
        if "wavs" not in folders:
            return render_template(
                "import-export.html",
                message="Dataset missing wavs folder. Make sure this folder is in the root of the zip file",
            )

        wavs = [
            x for x in files_list
            if x.startswith("wavs/") and x.endswith(".wav")
        ]
        if not wavs:
            return render_template("import-export.html",
                                   message="No wavs found in wavs folder")

        os.makedirs(dataset_directory, exist_ok=False)
        os.makedirs(audio_folder, exist_ok=False)

        # Save metadata
        with open(os.path.join(dataset_directory, "metadata.csv"), "wb") as f:
            data = z.read("metadata.csv")
            f.write(data)

        # Save wavs
        for wav in wavs:
            data = z.read(wav)
            path = os.path.join(audio_folder, wav.split("/")[1])
            with open(path, "wb") as f:
                f.write(data)

        # Create info file
        save_dataset_info(
            os.path.join(dataset_directory, "metadata.csv"),
            os.path.join(dataset_directory, "wavs"),
            os.path.join(dataset_directory, "info.json"),
        )

    return render_template(
        "import-export.html",
        message=f"Successfully uploaded {dataset_name} dataset")
Example #2
def extend_existing_dataset(
    text_path,
    audio_path,
    forced_alignment_path,
    output_path,
    label_path,
    suffix,
    info_path,
    logging=logging,
    min_confidence=0.85,
):
    """
    Extends an existing dataset.
    Converts audio to required format, generates clips & produces required files.

    Parameters
    ----------
    text_path : str
        Path to source text
    audio_path : str
        Path to source audio
    forced_alignment_path : str
        Path to save alignment JSON to
    output_path : str
        Path to save audio clips to
    label_path : str
        Path to save label file to
    suffix : str
        String suffix to append to filenames
    info_path : str
        Path to save info JSON to
    logging : logging (optional)
        Logging object to write logs to
    min_confidence : float (optional)
        Minimum confidence score to generate a clip for

    Raises
    ------
    AssertionError
        If given paths are invalid or clips could not be produced
    """
    assert os.path.isdir(output_path), "Missing existing dataset clips folder"
    assert os.path.isfile(label_path), "Missing existing dataset metadata file"
    logging.info(f"Coverting {audio_path}...")
    converted_audio = convert_audio(audio_path)
    extend_dataset(
        converted_audio,
        text_path,
        forced_alignment_path,
        output_path,
        label_path,
        suffix,
        logging=logging,
        min_confidence=min_confidence,
    )
    logging.info("Getting dataset info...")
    save_dataset_info(label_path, output_path, info_path)
Example #3
def create_dataset(text_path, audio_path, forced_alignment_path, output_path,
                   label_path, info_path, logging):
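    """Generates a dataset: converts the audio, produces clips and writes the info file."""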
    logging.info(f"Coverting {audio_path}...")
    converted_audio = convert_audio(audio_path)
    clip_generator(converted_audio,
                   text_path,
                   forced_alignment_path,
                   output_path,
                   label_path,
                   logging=logging)
    logging.info("Getting dataset info...")
    save_dataset_info(label_path, output_path, info_path)
Example #4
def create_dataset(
    text_path,
    audio_path,
    forced_alignment_path,
    output_path,
    label_path,
    info_path,
    logging=logging,
    min_confidence=0.85,
):
    """
    Generates a dataset.
    Converts audio to required format, generates clips & produces required files.

    Parameters
    ----------
    text_path : str
        Path to source text
    audio_path : str
        Path to source audio
    forced_alignment_path : str
        Path to save alignment JSON to
    output_path : str
        Path to save audio clips to
    label_path : str
        Path to save label file to
    info_path : str
        Path to save info JSON to
    logging : logging (optional)
        Logging object to write logs to
    min_confidence : float (optional)
        Minimum confidence score to generate a clip for

    Raises
    ------
    AssertionError
        If given paths are invalid or clips could not be produced
    """
    logging.info(f"Coverting {audio_path}...")
    converted_audio = convert_audio(audio_path)
    clip_generator(
        converted_audio,
        text_path,
        forced_alignment_path,
        output_path,
        label_path,
        logging=logging,
        min_confidence=min_confidence,
    )
    logging.info("Getting dataset info...")
    save_dataset_info(label_path, output_path, info_path)
Example #5
def extend_existing_dataset(text_path, audio_path, forced_alignment_path,
                            output_path, label_path, suffix, info_path,
                            logging):
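    """Extends an existing dataset: converts the audio, appends new clips and refreshes the info file."""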
    assert os.path.isdir(output_path), "Missing existing dataset clips folder"
    assert os.path.isfile(label_path), "Missing existing dataset metadata file"
    logging.info(f"Coverting {audio_path}...")
    converted_audio = convert_audio(audio_path)
    extend_dataset(converted_audio,
                   text_path,
                   forced_alignment_path,
                   output_path,
                   label_path,
                   suffix,
                   logging=logging)
    logging.info("Getting dataset info...")
    save_dataset_info(label_path, output_path, info_path)
Example #6
def import_dataset(dataset, dataset_directory, audio_folder, logging):
    """
    Imports a dataset zip into the app.
    Checks required files are present, saves the files,
    converts the audio to the required format and generates the info file.
    Deletes given zip regardless of success.

    Parameters
    ----------
    dataset : str
        Path to dataset zip
    dataset_directory : str
        Destination path for the dataset
    audio_folder : str
        Destination path for the dataset audio
    logging : logging
        Logging object to write logs to

    Raises
    ------
    AssertionError
        If files are missing or invalid
    """
    try:
        with zipfile.ZipFile(dataset, mode="r") as z:
            files_list = z.namelist()

            assert ("metadata.csv" in files_list) or (
                "trainlist.txt" in files_list and "vallist.txt" in files_list
            ), "Dataset doesn't include metadata.csv or trainlist.txt/vallist.txt. Make sure this is in the root of the zip file"

            folders = [x.split("/")[0] for x in files_list if "/" in x]
            assert (
                "wavs" in folders
            ), "Dataset missing wavs folder. Make sure this folder is in the root of the zip file"

            wavs = [x for x in files_list if x.startswith("wavs/") and x.endswith(".wav")]
            assert wavs, "No wavs found in wavs folder"

            logging.info("Creating directory")
            os.makedirs(dataset_directory, exist_ok=False)
            os.makedirs(audio_folder, exist_ok=False)

            if "metadata.csv" in files_list:
                metadata = z.read("metadata.csv").decode(CHARACTER_ENCODING, "ignore").replace("\r\n", "\n")
                num_metadata_rows = len([row for row in metadata.split("\n") if row])
                assert (
                    len(wavs) == num_metadata_rows
                ), f"Number of wavs and labels do not match. metadata: {num_metadata_rows}, wavs: {len(wavs)}"

                # Save metadata
                logging.info("Saving files")
                with open(os.path.join(dataset_directory, "metadata.csv"), "w", encoding=CHARACTER_ENCODING) as f:
                    f.write(metadata)
            else:
                trainlist = z.read("trainlist.txt").decode(CHARACTER_ENCODING, "ignore").replace("\r\n", "\n")
                vallist = z.read("vallist.txt").decode(CHARACTER_ENCODING, "ignore").replace("\r\n", "\n")
                num_rows = len([row for row in trainlist.split("\n") if row]) + len(
                    [row for row in vallist.split("\n") if row]
                )
                assert (
                    len(wavs) == num_rows
                ), f"Number of wavs and labels do not match. trainlist+vallist: {num_rows}, wavs: {len(wavs)}"

                # Save trainlist & vallist
                logging.info("Saving files")
                with open(os.path.join(dataset_directory, "trainlist.txt"), "w", encoding=CHARACTER_ENCODING) as f:
                    f.write(trainlist)
                with open(os.path.join(dataset_directory, "vallist.txt"), "w", encoding=CHARACTER_ENCODING) as f:
                    f.write(vallist)

            # Save wavs
            total_wavs = len(wavs)
            clip_lengths = []
            filenames = {}
            for i in range(total_wavs):
                wav = wavs[i]
                data = z.read(wav)
                path = os.path.join(audio_folder, wav.split("/")[1])
                with open(path, "wb") as f:
                    f.write(data)
                    new_path = convert_audio(path)
                    duration = librosa.get_duration(filename=new_path)
                    clip_lengths.append(duration)
                    filenames[path] = new_path
                logging.info(f"Progress - {i+1}/{total_wavs}")

            logging.info(f"Longest clip: {max(clip_lengths)}s, Shortest clip: {min(clip_lengths)}s")
            # Replace the original wavs with their converted versions
            logging.info("Deleting temp files")
            for old_path, new_path in filenames.items():
                os.remove(old_path)
                os.rename(new_path, old_path)

            # Create info file
            logging.info("Creating info file")
            words = (
                get_text(os.path.join(dataset_directory, "metadata.csv"))
                if "metadata.csv" in files_list
                else get_text(os.path.join(dataset_directory, "trainlist.txt"))
                + get_text(os.path.join(dataset_directory, "vallist.txt"))
            )
            save_dataset_info(
                words,
                os.path.join(dataset_directory, "wavs"),
                os.path.join(dataset_directory, "info.json"),
                clip_lengths=clip_lengths,
            )
    except Exception as e:
        os.remove(dataset)
        raise e

    os.remove(dataset)
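A minimal usage sketch for import_dataset, assuming it is in scope; every path below is hypothetical.

import logging
import os

logging.basicConfig(level=logging.INFO)

# Hypothetical locations; adjust to the real app layout
dataset_zip = os.path.join("uploads", "my_voice.zip")
dataset_directory = os.path.join("data", "datasets", "my_voice")
audio_folder = os.path.join(dataset_directory, "wavs")

# Validates the zip, extracts metadata and wavs, converts the audio and
# writes info.json; the zip is deleted whether or not the import succeeds
import_dataset(dataset_zip, dataset_directory, audio_folder, logging)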
Example #7
def extend_existing_dataset(
    text_path,
    audio_path,
    transcription_model,
    output_folder,
    suffix,
    logging=logging,
    min_length=MIN_LENGTH,
    max_length=MAX_LENGTH,
    min_confidence=0.85,
    combine_clips=True,
    symbols=DEFAULT_ALPHABET,
):
    """
    Extends an existing dataset.
    Converts audio to required format, generates clips & produces required files.

    Parameters
    ----------
    text_path : str
        Path to source text
    audio_path : str
        Path to source audio
    transcription_model : TranscriptionModel
        Transcription model
    output_folder : str
        Path to save dataset to
    suffix : str
        String suffix to append to filenames
    logging : logging (optional)
        Logging object to write logs to
    min_length : float (optional)
        Minimum duration of a clip in seconds
    max_length : float (optional)
        Maximum duration of a clip in seconds
    min_confidence : float (optional)
        Minimum confidence score to generate a clip for
    combine_clips : bool (optional)
        Whether to combine clips to make them longer
    symbols : list[str] (optional)
        List of valid symbols (defaults to DEFAULT_ALPHABET)

    Raises
    ------
    AssertionError
        If given paths are invalid or clips could not be produced
    """
    assert os.path.isdir(
        output_folder), "Missing existing dataset clips folder"
    logging.info(f"Converting {audio_path}...")
    converted_audio = convert_audio(audio_path)

    forced_alignment_path = os.path.join(output_folder, ALIGNMENT_FILE)
    output_path = os.path.join(output_folder, AUDIO_FOLDER)
    unlabelled_path = os.path.join(output_folder, UNLABELLED_FOLDER)
    label_path = os.path.join(output_folder, METADATA_FILE)
    info_path = os.path.join(output_folder, INFO_FILE)
    temp_label_path = label_path.replace(Path(label_path).name, "temp.csv")
    temp_unlabelled_folder = unlabelled_path.replace(
        Path(unlabelled_path).name, "temp_unlabelled")
    temp_wavs_folder = output_path.replace(Path(output_path).name, "temp_wavs")

    clip_generator(
        converted_audio,
        text_path,
        transcription_model,
        forced_alignment_path,
        temp_wavs_folder,
        temp_unlabelled_folder,
        temp_label_path,
        logging=logging,
        symbols=symbols,
        min_length=min_length,
        max_length=max_length,
        min_confidence=min_confidence,
        combine_clips=combine_clips,
    )

    with open(temp_label_path) as f:
        new_labels = f.readlines()

    with open(label_path, "a+") as f:
        for line in new_labels:
            filename, text = line.split("|", 1)
            new_filename = add_suffix(filename, suffix)
            f.write(f"{new_filename}|{text}")

    for filename in os.listdir(temp_wavs_folder):
        new_filename = add_suffix(filename, suffix)
        shutil.copyfile(os.path.join(temp_wavs_folder, filename),
                        os.path.join(output_path, new_filename))

    for filename in os.listdir(temp_unlabelled_folder):
        new_filename = add_suffix(filename, suffix)
        shutil.copyfile(os.path.join(temp_unlabelled_folder, filename),
                        os.path.join(unlabelled_path, new_filename))

    os.remove(temp_label_path)
    shutil.rmtree(temp_wavs_folder)
    shutil.rmtree(temp_unlabelled_folder)
    logging.info("Combined dataset")

    logging.info("Getting dataset info...")
    # Do not pass clip lengths from extend_dataset as we need to get size of entire dataset (not just new clips)
    save_dataset_info(label_path, output_path, info_path)
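A usage sketch for the extend flow above. The transcription model construction is app-specific and shown only as a placeholder; the paths and suffix are hypothetical.

import logging

logging.basicConfig(level=logging.INFO)

model = ...  # placeholder: a TranscriptionModel instance loaded elsewhere

extend_existing_dataset(
    text_path="book.txt",                    # hypothetical source text
    audio_path="session2.mp3",               # hypothetical new recording
    transcription_model=model,
    output_folder="data/datasets/my_voice",  # must already contain the dataset
    suffix="session2",                       # appended to new clip filenames to avoid collisions
    logging=logging,
    min_confidence=0.85,
)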
Example #8
def create_dataset(
    text_path,
    audio_path,
    transcription_model,
    output_folder,
    logging=logging,
    min_length=MIN_LENGTH,
    max_length=MAX_LENGTH,
    min_confidence=0.85,
    combine_clips=True,
    symbols=DEFAULT_ALPHABET,
):
    """
    Generates a dataset.
    Converts audio to required format, generates clips & produces required files.

    Parameters
    ----------
    text_path : str
        Path to source text
    audio_path : str
        Path to source audio
    transcription_model : TranscriptionModel
        Transcription model
    output_folder : str
        Path to save dataset to
    logging : logging (optional)
        Logging object to write logs to
    min_length : float (optional)
        Minimum duration of a clip in seconds
    max_length : float (optional)
        Maximum duration of a clip in seconds
    min_confidence : float (optional)
        Minimum confidence score to generate a clip for
    combine_clips : bool (optional)
        Whether to combine clips to make them longer
    symbols : list[str] (optional)
        List of valid symbols (defaults to DEFAULT_ALPHABET)

    Raises
    ------
    AssertionError
        If given paths are invalid or clips could not be produced
    """
    logging.info(f"Converting {audio_path}...")
    converted_audio = convert_audio(audio_path)
    forced_alignment_path = os.path.join(output_folder, ALIGNMENT_FILE)
    output_path = os.path.join(output_folder, AUDIO_FOLDER)
    unlabelled_path = os.path.join(output_folder, UNLABELLED_FOLDER)
    label_path = os.path.join(output_folder, METADATA_FILE)
    info_path = os.path.join(output_folder, INFO_FILE)

    try:
        clip_lengths = clip_generator(
            converted_audio,
            text_path,
            transcription_model,
            forced_alignment_path,
            output_path,
            unlabelled_path,
            label_path,
            logging=logging,
            symbols=symbols,
            min_length=min_length,
            max_length=max_length,
            min_confidence=min_confidence,
            combine_clips=combine_clips,
        )
    except Exception as e:
        shutil.rmtree(output_folder)
        raise e

    logging.info("Getting dataset info...")
    save_dataset_info(label_path, output_path, info_path, clip_lengths)
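The matching sketch for creating a dataset from scratch; as above, the model and all paths are placeholders.

import logging

logging.basicConfig(level=logging.INFO)

model = ...  # placeholder: a TranscriptionModel instance loaded elsewhere

create_dataset(
    text_path="book.txt",                     # hypothetical transcript
    audio_path="audiobook.mp3",               # hypothetical source audio
    transcription_model=model,
    output_folder="data/datasets/new_voice",  # clips, metadata.csv and info.json land here
    logging=logging,
)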
Example #9
def import_dataset(dataset, dataset_directory, audio_folder, logging):
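    """Imports a dataset zip: validates, extracts and converts its contents; deletes the zip regardless of success."""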
    try:
        with zipfile.ZipFile(dataset, mode="r") as z:
            files_list = z.namelist()
            assert (
                "metadata.csv" in files_list
            ), "Dataset missing metadata.csv. Make sure this file is in the root of the zip file"

            folders = [x.split("/")[0] for x in files_list if "/" in x]
            assert (
                "wavs" in folders
            ), "Dataset missing wavs folder. Make sure this folder is in the root of the zip file"

            wavs = [
                x for x in files_list
                if x.startswith("wavs/") and x.endswith(".wav")
            ]
            assert wavs, "No wavs found in wavs folder"

            metadata = z.read("metadata.csv").decode("utf-8",
                                                     "ignore").replace(
                                                         "\r\n", "\n")
            num_metadata_rows = len(
                [row for row in metadata.split("\n") if row])
            assert (
                len(wavs) == num_metadata_rows
            ), f"Number of wavs and labels do not match. metadata: {num_metadata_rows}, wavs: {len(wavs)}"

            logging.info("Creating directory")
            os.makedirs(dataset_directory, exist_ok=False)
            os.makedirs(audio_folder, exist_ok=False)

            # Save metadata
            logging.info("Saving files")
            with open(os.path.join(dataset_directory, "metadata.csv"),
                      "w",
                      encoding="utf-8") as f:
                f.write(metadata)

            # Save wavs
            total_wavs = len(wavs)
            clip_lengths = []
            filenames = {}
            for i in range(total_wavs):
                wav = wavs[i]
                data = z.read(wav)
                path = os.path.join(dataset_directory, "wavs",
                                    wav.split("/")[1])
                with open(path, "wb") as f:
                    f.write(data)
                    new_path = convert_audio(path)
                    clip_lengths.append(
                        librosa.get_duration(filename=new_path))
                    filenames[path] = new_path
                logging.info(f"Progress - {i+1}/{total_wavs}")

            # Replace the original wavs with their converted versions
            logging.info("Deleting temp files")
            for old_path, new_path in filenames.items():
                os.remove(old_path)
                os.rename(new_path, old_path)

            # Create info file
            logging.info("Creating info file")
            save_dataset_info(
                os.path.join(dataset_directory, "metadata.csv"),
                os.path.join(dataset_directory, "wavs"),
                os.path.join(dataset_directory, "info.json"),
                clip_lengths=clip_lengths,
            )
    except Exception as e:
        os.remove(dataset)
        raise e

    os.remove(dataset)