def prepare_librispeech(
    data_folder,
    save_folder,
    tr_splits=None,
    dev_splits=None,
    te_splits=None,
    select_n_sentences=None,
    merge_lst=None,
    merge_name=None,
    create_lexicon=False,
    skip_prep=False,
):
    """
    This function prepares the csv files for the LibriSpeech dataset.
    Download link: http://www.openslr.org/12

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original LibriSpeech dataset is stored.
    save_folder : str
        The directory where to store the csv files.
    tr_splits : list
        List of train splits to prepare from ['test-others','train-clean-100',
        'train-clean-360','train-other-500'].
    dev_splits : list
        List of dev splits to prepare from ['dev-clean','dev-others'].
    te_splits : list
        List of test splits to prepare from ['test-clean','test-others'].
    select_n_sentences : list of int
        Default : None
        If not None, one entry per split giving how many sentences to pick
        from that split.
    merge_lst : list
        List of librispeech splits (e.g, train-clean, train-clean-360,..) to
        merge in a single csv file.
    merge_name : str
        Name of the merged csv file.
    create_lexicon : bool
        If True, it outputs csv files containing the mapping between grapheme
        to phonemes. Use it for training a G2P system.
    skip_prep : bool
        If True, data preparation is skipped.


    Example
    -------
    >>> data_folder = 'datasets/LibriSpeech'
    >>> tr_splits = ['train-clean-100']
    >>> dev_splits = ['dev-clean']
    >>> te_splits = ['test-clean']
    >>> save_folder = 'librispeech_prepared'
    >>> prepare_librispeech(data_folder, save_folder, tr_splits, dev_splits, \
                            te_splits)
    """

    if skip_prep:
        return

    # Guard against mutable default arguments being shared across calls.
    tr_splits = [] if tr_splits is None else tr_splits
    dev_splits = [] if dev_splits is None else dev_splits
    te_splits = [] if te_splits is None else te_splits

    splits = tr_splits + dev_splits + te_splits
    conf = {
        "select_n_sentences": select_n_sentences,
    }

    # Other variables
    # Saving folder
    os.makedirs(save_folder, exist_ok=True)

    save_opt = os.path.join(save_folder, OPT_FILE)

    # Check if this phase is already done (if so, skip it)
    if skip(splits, save_folder, conf):
        logger.info("Skipping preparation, completed in previous run.")
        return
    logger.info("Data_preparation...")

    # Additional checks to make sure the data folder contains Librispeech
    check_librispeech_folders(data_folder, splits)

    # create csv files for each split
    all_texts = {}
    for split_index, split in enumerate(splits):

        wav_lst = get_all_files(
            os.path.join(data_folder, split), match_and=[".flac"]
        )

        text_lst = get_all_files(
            os.path.join(data_folder, split), match_and=["trans.txt"]
        )

        text_dict = text_to_dict(text_lst)
        all_texts.update(text_dict)

        if select_n_sentences is not None:
            n_sentences = select_n_sentences[split_index]
        else:
            n_sentences = len(wav_lst)

        create_csv(
            save_folder,
            wav_lst,
            text_dict,
            split,
            n_sentences,
        )

    # Merging csv file if needed
    if merge_lst and merge_name is not None:
        merge_files = [split_libri + ".csv" for split_libri in merge_lst]
        merge_csvs(
            data_folder=save_folder,
            csv_lst=merge_files,
            merged_csv=merge_name,
        )

    # Create lexicon.csv and oov.csv
    if create_lexicon:
        create_lexicon_and_oov_csv(all_texts, data_folder, save_folder)

    # saving options
    save_pkl(conf, save_opt)
# ----- Example #2 -----
def _spell_out_numbers(transcript, p):
    """Upper-case *transcript* and spell out numeric tokens with inflect.

    Times such as "9:30AM" become "NINE THIRTY A M", plain numbers become
    their word form, matching what an ASR system would output.  Hyphens
    introduced by inflect (e.g. "TWENTY-ONE") are replaced with spaces.

    Arguments
    ---------
    transcript : str
        The ground-truth transcript to normalize.
    p : inflect.engine
        An inflect engine used for number-to-words conversion.
    """
    words = transcript.split()
    for w in range(len(words)):
        words[w] = words[w].upper()
        # If the word is numeric, we need to convert it to letters, to
        # match what the ASR would output.
        if any(c.isdigit() for c in words[w]):
            if "AM" in words[w] or "PM" in words[w]:
                AM_or_PM = "A M" if "AM" in words[w] else "P M"
                suffix = "AM" if "AM" in words[w] else "PM"
                if ":" in words[w]:
                    hour = words[w].split(":")[0]
                    minute = words[w].split(":")[1].split(suffix)[0]
                    words[w] = (
                        p.number_to_words(hour).upper()
                        + " "
                        + p.number_to_words(minute).upper()
                        + " "
                        + AM_or_PM
                    )
                else:
                    hour = words[w].split(suffix)[0]
                    words[w] = p.number_to_words(hour).upper() + " " + AM_or_PM
            else:
                words[w] = p.number_to_words(words[w]).upper()
    return " ".join(words).replace("-", " ")


def prepare_TAS(data_folder, save_folder, type, train_splits, skip_prep=False):
    """
    This function prepares the Timers and Such dataset.
    If the folder does not exist, the zip file will be extracted. If the zip
    file does not exist, it will be downloaded.

    Arguments
    ---------
    data_folder : str
        Path to Timers and Such dataset.
    save_folder : str
        Path where to save the csv manifest files.
    type : str
        One of the following:

        "direct":{input=audio, output=semantics}
        "multistage":{input=audio, output=semantics} (using ASR transcripts in the middle)
        "decoupled":{input=transcript, output=semantics} (using ground-truth transcripts)
    train_splits : list
        List of splits to be joined to form train .csv.
    skip_prep : bool
        If True, skip data preparation.
    """
    if skip_prep:
        return
    if type == "decoupled":
        # inflect is only needed to spell out numbers in the ground-truth
        # transcripts; fail early with a clear message if it is missing.
        try:
            import inflect

            p = inflect.engine()
        except ModuleNotFoundError:
            logger.info(
                'Error: the inflect module must be installed to run the "decoupled" SLU recipe.'
            )
            logger.info("Install using `pip install inflect`.")
            raise

    # If the data folders do not exist, we need to extract the data
    if not os.path.isdir(os.path.join(data_folder, "train-synth")):
        # Check for zip file and download if it doesn't exist
        zip_location = os.path.join(data_folder, "timers-and-such.zip")
        if not os.path.exists(zip_location):
            url = "https://zenodo.org/record/4623772/files/timers-and-such-v1.0.zip?download=1"
            download_file(url, zip_location, unpack=True)
        else:
            logger.info("Extracting timers-and-such.zip...")
            shutil.unpack_archive(zip_location, data_folder)

    splits = [
        "train-real",
        "dev-real",
        "test-real",
        "train-synth",
        "dev-synth",
        "test-synth",
    ]
    ID_start = 0  # needed to have a unique ID for each audio
    for split in splits:
        new_filename = os.path.join(save_folder, split) + "-type=%s.csv" % type
        if os.path.exists(new_filename):
            continue
        logger.info("Preparing %s..." % new_filename)

        # Only the columns actually written to the csv are collected.
        ID = []
        duration = []
        wav = []
        spk_id = []
        semantics = []
        transcript = []

        df = pd.read_csv(os.path.join(data_folder, split) + ".csv")
        for i in range(len(df)):
            ID.append(ID_start + i)
            # Duration in seconds, assuming a 16 kHz sampling rate.
            signal = read_audio(os.path.join(data_folder, df.path[i]))
            duration.append(signal.shape[0] / 16000)

            wav.append(os.path.join(data_folder, df.path[i]))
            spk_id.append(df.speakerId[i])

            transcript_ = df.transcription[i]
            if type == "decoupled":
                transcript_ = _spell_out_numbers(transcript_, p)
            transcript.append(transcript_)

            semantics_ = df.semantics[i].replace(
                ".3333333333333333",
                ".33")  # Fix formatting error in some labels
            if type == "direct" or type == "multistage" or type == "decoupled":
                semantics.append(semantics_)
            if type == "joint-transcript-semantics":
                semantics.append("{'transcript': '" + transcript_ + "'| " +
                                 semantics_[1:])
            if type == "joint-semantics-transcript":
                semantics.append(semantics_[:-1] + "| 'transcript': '" +
                                 transcript_ + "'}")

        new_df = pd.DataFrame({
            "ID": ID,
            "duration": duration,
            "wav": wav,
            "spk_id": spk_id,
            "semantics": semantics,
            "transcript": transcript,
        })
        new_df.to_csv(new_filename, index=False)
        ID_start += len(df)

    # Merge train splits
    train_splits = [split + "-type=%s.csv" % type for split in train_splits]
    merge_csvs(save_folder, train_splits, "train-type=%s.csv" % type)

    # Create "all-real" split
    real_splits = [
        split + "-type=%s.csv" % type
        for split in ["train-real", "dev-real", "test-real"]
    ]
    merge_csvs(save_folder, real_splits, "all-real-type=%s.csv" % type)
def prepare_ksponspeech(
    data_folder,
    save_folder,
    tr_splits=None,
    dev_splits=None,
    te_splits=None,
    select_n_sentences=None,
    merge_lst=None,
    merge_name=None,
    skip_prep=False,
):
    """
    This function prepares the csv files for the KsponSpeech dataset.
    Download link: https://aihub.or.kr/aidata/105/download

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original KsponSpeech dataset is stored.
    save_folder : str
        The directory where to store the csv files.
    tr_splits : list
        List of train splits to prepare from ['train', 'dev', 'eval_clean',
        'eval_other'].
    dev_splits : list
        List of dev splits to prepare from ['dev'].
    te_splits : list
        List of test splits to prepare from ['eval_clean','eval_other'].
    select_n_sentences : list of int
        Default : None
        If not None, one entry per split giving how many sentences to pick
        from that split.
    merge_lst : list
        List of KsponSpeech splits (e.g, eval_clean, eval_other) to
        merge in a single csv file.
    merge_name : str
        Name of the merged csv file.
    skip_prep : bool
        If True, data preparation is skipped.


    Example
    -------
    >>> data_folder = 'datasets/KsponSpeech'
    >>> tr_splits = ['train']
    >>> dev_splits = ['dev']
    >>> te_splits = ['eval_clean']
    >>> save_folder = 'KsponSpeech_prepared'
    >>> prepare_ksponspeech(data_folder, save_folder, tr_splits, dev_splits, \
                            te_splits)
    """

    if skip_prep:
        return

    # Guard against mutable default arguments being shared across calls.
    tr_splits = [] if tr_splits is None else tr_splits
    dev_splits = [] if dev_splits is None else dev_splits
    te_splits = [] if te_splits is None else te_splits

    splits = tr_splits + dev_splits + te_splits
    conf = {
        "select_n_sentences": select_n_sentences,
    }

    # Other variables
    # Saving folder
    os.makedirs(save_folder, exist_ok=True)

    save_opt = os.path.join(save_folder, OPT_FILE)

    # Check if this phase is already done (if so, skip it)
    if skip(splits, save_folder, conf):
        logger.info("Skipping preparation, completed in previous run.")
        return
    logger.info("Data_preparation...")

    # Additional checks to make sure the data folder contains ksponspeech
    check_ksponspeech_folders(data_folder, splits)

    # parse trn file
    all_texts = {}
    for split_index, split in enumerate(splits):

        # A split maps to several directories; gather the wavs of all of them.
        wav_lst = []
        for subdir in split2dirs(split):
            wav_lst += get_all_files(
                os.path.join(data_folder, subdir), match_and=[".wav"]
            )

        trnpath = os.path.join(data_folder, split + ".trn")
        text_dict = text_to_dict(trnpath)
        all_texts.update(text_dict)

        if select_n_sentences is not None:
            n_sentences = select_n_sentences[split_index]
        else:
            n_sentences = len(wav_lst)

        create_csv(
            save_folder,
            wav_lst,
            text_dict,
            split,
            n_sentences,
        )

    # Merging csv file if needed
    if merge_lst and merge_name is not None:
        merge_files = [split_kspon + ".csv" for split_kspon in merge_lst]
        merge_csvs(
            data_folder=save_folder,
            csv_lst=merge_files,
            merged_csv=merge_name,
        )

    # saving options
    save_pkl(conf, save_opt)
# ----- Example #4 -----
def prepare_SLURP(data_folder, slu_type, train_splits):
    """
    This function prepares the SLURP dataset.

    The dataset (audio plus the `dataset/slurp/*.jsonl` annotation files)
    is expected to already be present under ``data_folder``.

    Arguments
    ---------
    data_folder : str
        Path to SLURP dataset.
    slu_type : str
        One of the following:

        "direct":{input=audio, output=semantics}
        "multistage":{input=audio, output=semantics} (using ASR transcripts in the middle)
        "decoupled":{input=transcript, output=semantics} (using ground-truth transcripts)
    train_splits : list
        List of splits to be joined to form train .csv.
    """

    splits = [
        "train",
        "train_synthetic",
        "devel",
        "test",
    ]
    next_id = 0  # needed to have a unique ID for each recording
    for split in splits:
        new_filename = (os.path.join(data_folder, split) +
                        "-type=%s.csv" % slu_type)
        # Splits already prepared in a previous run are not redone.
        if os.path.exists(new_filename):
            continue
        print("Preparing %s..." % new_filename)

        IDs = []
        duration = []

        wav = []
        wav_format = []
        wav_opts = []

        semantics = []
        semantics_format = []
        semantics_opts = []

        transcript = []
        transcript_format = []
        transcript_opts = []

        jsonl_path = os.path.join(data_folder,
                                  "dataset/slurp/" + split + ".jsonl")
        with jsonlines.open(jsonl_path) as reader:
            for obj in reader:
                scenario = obj["scenario"]
                action = obj["action"]
                sentence_annotation = obj["sentence_annotation"]
                # Each annotated entity looks like "[type : filler]" inside
                # the sentence; pull them out one bracket pair at a time.
                num_entities = sentence_annotation.count("[")
                entities = []
                for slot in range(num_entities):
                    slot_str = (
                        sentence_annotation.split("[")[slot + 1].split("]")[0]
                    )
                    slot_type = slot_str.split(":")[0].strip()
                    filler = slot_str.split(":")[1].strip()
                    entities.append({"type": slot_type, "filler": filler})
                for recording in obj["recordings"]:
                    IDs.append(next_id)
                    if "synthetic" in split:
                        audio_folder = "scripts/audio/slurp_synth/"
                    else:
                        audio_folder = "scripts/audio/slurp_real/"
                    path = os.path.join(data_folder, audio_folder,
                                        recording["file"])
                    # Duration in seconds, assuming a 16 kHz sampling rate.
                    signal = read_wav_soundfile(path)
                    duration.append(signal.shape[0] / 16000)

                    wav.append(path)
                    wav_format.append("flac")
                    wav_opts.append(None)

                    transcript_ = obj["sentence"]
                    if slu_type == "decoupled":
                        transcript_ = transcript_.upper()
                    transcript.append(transcript_)
                    transcript_format.append("string")
                    transcript_opts.append(None)

                    semantics_dict = {
                        "scenario": scenario,
                        "action": action,
                        "entities": entities,
                    }
                    semantics_ = str(semantics_dict).replace(
                        ",", "|"
                    )  # Commas in dict will make using csv files tricky; replace with pipe.
                    semantics.append(semantics_)
                    semantics_format.append("string")
                    semantics_opts.append(None)
                    next_id += 1

        df = pd.DataFrame({
            "ID": IDs,
            "duration": duration,
            "wav": wav,
            "wav_format": wav_format,
            "wav_opts": wav_opts,
            "semantics": semantics,
            "semantics_format": semantics_format,
            "semantics_opts": semantics_opts,
            "transcript": transcript,
            "transcript_format": transcript_format,
            "transcript_opts": transcript_opts,
        })
        df.to_csv(new_filename, index=False)

    # Merge train splits
    train_splits = [
        split + "-type=%s.csv" % slu_type for split in train_splits
    ]
    merge_csvs(data_folder, train_splits, "train-type=%s.csv" % slu_type)