Example 1
def main():

    print(datetime.now())

    # RAW FEATURES

    # Extract MFCCs
    mfcc_dir = path.join("mfcc", "xitsonga")
    if not path.isdir(mfcc_dir):
        os.makedirs(mfcc_dir)
    output_fn = path.join(mfcc_dir, "xitsonga.dd.npz")
    if not path.isfile(output_fn):
        print("Extracting MFCCs")
        extract_features("mfcc", output_fn)
    else:
        print("Using existing file:", output_fn)

    # # Extract filterbanks
    # fbank_dir = path.join("fbank", "xitsonga")
    # if not path.isdir(fbank_dir):
    #     os.makedirs(fbank_dir)
    # output_fn = path.join(fbank_dir, "xitsonga.npz")
    # if not path.isfile(output_fn):
    #     print("Extracting filterbanks")
    #     extract_features("fbank", output_fn)
    # else:
    #     print("Using existing file:", output_fn)

    # GROUND TRUTH WORD SEGMENTS

    # Create a ground truth word list of at least 50 frames and 5 characters
    fa_fn = path.join("..", "data", "xitsonga.wrd")
    list_dir = "lists"
    if not path.isdir(list_dir):
        os.makedirs(list_dir)
    list_fn = path.join(list_dir, "xitsonga.samediff.list")
    if not path.isfile(list_fn):
        utils.write_samediff_words(fa_fn, list_fn)
    else:
        print("Using existing file:", list_fn)

    # Extract word segments from the MFCC NumPy archive
    input_npz_fn = path.join(mfcc_dir, "xitsonga.dd.npz")
    output_npz_fn = path.join(mfcc_dir, "xitsonga.samediff.dd.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting MFCCs for same-different word tokens")
        utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)

    # # Extract word segments from the filterbank NumPy archive
    # input_npz_fn = path.join(fbank_dir, "xitsonga.npz")
    # output_npz_fn = path.join(fbank_dir, "xitsonga.samediff.npz")
    # if not path.isfile(output_npz_fn):
    #     print("Extracting filterbanks for same-different word tokens")
    #     utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
    # else:
    #     print("Using existing file:", output_npz_fn)

    print(datetime.now())
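Every example in this file delegates the actual cutting of word segments to utils.segments_from_npz, whose implementation is not shown. The sketch below is a hypothetical reconstruction (the name segments_from_npz_sketch and the exact key format are assumptions, inferred from how Example 6 parses segment keys): it slices the listed frame intervals out of an utterance-keyed NumPy archive and saves them under their segment keys.

import codecs

import numpy as np


def segments_from_npz_sketch(input_npz_fn, list_fn, output_npz_fn):
    """Cut the listed segments out of an utterance-level feature archive.

    Assumes segment keys of the form "label_speaker_utt_start-end", with
    start and end given as frame indices, and an archive keyed by
    "speaker_utt".
    """
    print("Reading:", input_npz_fn)
    input_npz = np.load(input_npz_fn)
    output_dict = {}
    print("Reading:", list_fn)
    with codecs.open(list_fn, "r", "utf-8") as f:
        for line in f:
            segment_key = line.strip()
            # Split off the frame interval and drop the leading word label
            label_and_utt, start_end = segment_key.rsplit("_", 1)
            utt_key = label_and_utt.split("_", 1)[1]
            start, end = [int(i) for i in start_end.split("-")]
            output_dict[segment_key] = input_npz[utt_key][start:end]
    print("Writing:", output_npz_fn)
    np.savez(output_npz_fn, **output_dict)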
Example 2
def main():

    print(datetime.now())

    # RAW FEATURES

    # Extract MFCCs for the different sets
    mfcc_dir = path.join("mfcc", "buckeye")
    if not path.isdir(mfcc_dir):
        os.makedirs(mfcc_dir)
    for subset in ["devpart1", "devpart2", "zs"]:
        output_fn = path.join(mfcc_dir, subset + ".dd.npz")
        if not path.isfile(output_fn):
            print("Extracting MFCCs:", subset)
            extract_features_for_subset(subset, "mfcc", output_fn)
        else:
            print("Using existing file:", output_fn)

    # # Extract filterbanks for the different sets
    # fbank_dir = path.join("fbank", "buckeye")
    # for subset in ["devpart1", "devpart2", "zs"]:
    #     if not path.isdir(fbank_dir):
    #         os.makedirs(fbank_dir)
    #     output_fn = path.join(fbank_dir, subset + ".npz")
    #     if not path.isfile(output_fn):
    #         print("Extracting filterbanks:", subset)
    #         extract_features_for_subset(subset, "fbank", output_fn)
    #     else:
    #         print("Using existing file:", output_fn)

    # GROUND TRUTH WORD SEGMENTS

    # Create a ground truth word list of at least 50 frames and 5 characters
    fa_fn = path.join("..", "data", "buckeye_english.wrd")
    list_dir = "lists"
    if not path.isdir(list_dir):
        os.makedirs(list_dir)
    list_fn = path.join(list_dir, "buckeye.samediff.list")
    if not path.isfile(list_fn):
        utils.write_samediff_words(fa_fn, list_fn)
    else:
        print("Using existing file:", list_fn)

    # Extract word segments from the MFCC NumPy archives
    for subset in ["devpart1", "devpart2", "zs"]:
        input_npz_fn = path.join(mfcc_dir, subset + ".dd.npz")
        output_npz_fn = path.join(mfcc_dir, subset + ".samediff.dd.npz")
        if not path.isfile(output_npz_fn):
            print("Extracting MFCCs for same-different word tokens:", subset)
            utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)

    print(datetime.now())
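Examples 1 to 3 also rely on utils.write_samediff_words, which Example 3 later calls with explicit min_frames and min_chars arguments. Below is a minimal sketch, assuming the .wrd forced-alignment file has lines of the form "utterance start end word" with times in seconds and 10 ms frames; the helper name and the file format are assumptions.

import codecs


def write_samediff_words_sketch(fa_fn, list_fn, min_frames=50, min_chars=5):
    """Write a same-different word list, one segment key per line.

    Keeps word tokens spanning at least `min_frames` frames whose labels
    have at least `min_chars` characters.
    """
    print("Reading:", fa_fn)
    segment_keys = []
    with codecs.open(fa_fn, "r", "utf-8") as f:
        for line in f:
            utt, start, end, label = line.strip().split()
            start_frame = int(round(float(start) * 100))
            end_frame = int(round(float(end) * 100))
            if (end_frame - start_frame >= min_frames
                    and len(label) >= min_chars):
                segment_keys.append("{}_{}_{:06d}-{:06d}".format(
                    label, utt, start_frame, end_frame))
    print("Writing:", list_fn)
    with codecs.open(list_fn, "w", "utf-8") as f:
        for segment_key in sorted(segment_keys):
            f.write(segment_key + "\n")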
Example 3
def main():

    print(datetime.now())

    # RAW FEATURES

    # Extract MFCCs for the different sets
    mfcc_dir = path.join("mfcc", "buckeye")
    if not path.isdir(mfcc_dir):
        os.makedirs(mfcc_dir)
    for subset in ["devpart1", "devpart2", "zs"]:
        output_fn = path.join(mfcc_dir, subset + ".dd.npz")
        if not path.isfile(output_fn):
            print("Extracting MFCCs:", subset)
            extract_features_for_subset(subset, "mfcc", output_fn)
        else:
            print("Using existing file:", output_fn)

    # Extract filterbanks for the different sets
    fbank_dir = path.join("fbank", "buckeye")
    if not path.isdir(fbank_dir):
        os.makedirs(fbank_dir)
    for subset in ["devpart1", "devpart2", "zs"]:
        output_fn = path.join(fbank_dir, subset + ".npz")
        if not path.isfile(output_fn):
            print("Extracting filterbanks:", subset)
            extract_features_for_subset(subset, "fbank", output_fn)
        else:
            print("Using existing file:", output_fn)

    # GROUND TRUTH WORD SEGMENTS

    # Create a ground truth word list of at least 50 frames and 5 characters
    fa_fn = path.join("..", "data", "buckeye_english.wrd")
    list_dir = "lists"
    if not path.isdir(list_dir):
        os.makedirs(list_dir)
    list_fn = path.join(list_dir, "buckeye.samediff.list")
    if not path.isfile(list_fn):
        utils.write_samediff_words(fa_fn, list_fn)
    else:
        print("Using existing file:", list_fn)

    # Extract word segments from the MFCC NumPy archives
    for subset in ["devpart1", "devpart2", "zs"]:
        input_npz_fn = path.join(mfcc_dir, subset + ".dd.npz")
        output_npz_fn = path.join(mfcc_dir, subset + ".samediff.dd.npz")
        if not path.isfile(output_npz_fn):
            print("Extracting MFCCs for same-different word tokens:", subset)
            utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)

    # Extract word segments from the filterbank NumPy archives
    for subset in ["devpart1", "devpart2", "zs"]:
        input_npz_fn = path.join(fbank_dir, subset + ".npz")
        output_npz_fn = path.join(fbank_dir, subset + ".samediff.npz")
        if not path.isfile(output_npz_fn):
            print("Extracting filterbanks for same-different word tokens:",
                  subset)
            utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)

    # Create a ground truth word list of at least 39 frames and 4 characters
    list_fn = path.join(list_dir, "buckeye.samediff2.list")
    if not path.isfile(list_fn):
        utils.write_samediff_words(fa_fn, list_fn, min_frames=39, min_chars=4)
    else:
        print("Using existing file:", list_fn)

    # Extract word segments from the MFCC NumPy archives
    for subset in ["devpart1"]:  # , "devpart2", "zs"]:
        input_npz_fn = path.join(mfcc_dir, subset + ".dd.npz")
        output_npz_fn = path.join(mfcc_dir, subset + ".samediff2.dd.npz")
        if not path.isfile(output_npz_fn):
            print("Extracting MFCCs for same-different word tokens:", subset)
            utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)

    # UTD-DISCOVERED WORD SEGMENTS

    # Remove non-VAD regions from the UTD pair list
    input_pairs_fn = path.join("..", "data", "buckeye.fdlps.0.93.pairs")
    output_pairs_fn = path.join("lists", "buckeye.utd_pairs.list")
    if not path.isfile(output_pairs_fn):
        # Read voice activity regions
        fa_fn = path.join("..", "data", "buckeye_english.wrd")
        print("Reading:", fa_fn)
        vad_dict = utils.read_vad_from_fa(fa_fn)

        # Create new pair list
        utils.strip_nonvad_from_pairs(vad_dict, input_pairs_fn,
                                      output_pairs_fn)
    else:
        print("Using existing file:", output_pairs_fn)

    # Create the UTD word list
    list_fn = path.join("lists", "buckeye.utd_terms.list")
    if not path.isfile(list_fn):
        utils.terms_from_pairs(output_pairs_fn, list_fn)
    else:
        print("Using existing file:", list_fn)

    # Extract UTD segments from the MFCC NumPy archives
    for subset in ["devpart1"]:

        # Extract pair and term list for speakers in subset
        speaker_fn = path.join("..", "data",
                               "buckeye_{}_speakers.list".format(subset))
        input_pairs_fn = output_pairs_fn
        output_pairs_fn = path.join("lists", "devpart1.utd_pairs.list")
        if not path.isfile(output_pairs_fn):
            utils.pairs_for_speakers(speaker_fn, input_pairs_fn,
                                     output_pairs_fn)
        else:
            print("Using existing file:", output_pairs_fn)
        list_fn = path.join("lists", "devpart1.utd_terms.list")
        if not path.isfile(list_fn):
            utils.terms_from_pairs(output_pairs_fn, list_fn)
        else:
            print("Using existing file:", list_fn)

        # Extract UTD segments
        input_npz_fn = path.join(mfcc_dir, subset + ".dd.npz")
        output_npz_fn = path.join(mfcc_dir, subset + ".utd.dd.npz")
        if not path.isfile(output_npz_fn):
            print("Extracting MFCCs for UTD word tokens:", subset)
            utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)

    # BES-GMM DISCOVERED WORD SEGMENTS

    for subset in ["devpart1"]:

        # All discovered words
        input_npz_fn = path.join(mfcc_dir, subset + ".dd.npz")
        output_npz_fn = path.join(mfcc_dir, subset + ".besgmm.dd.npz")
        if not path.isfile(output_npz_fn):
            list_fn = path.join(
                "..", "data", "buckeye_devpart1.52e70ca864.besgmm_terms.txt")
            print("Extracting MFCCs for BES-GMM word tokens:", subset)
            utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)

        # A maximum of three pairs per class
        pairs_fn = path.join(
            "..", "data", "buckeye_devpart1.52e70ca864.besgmm_pairs_filt7.txt")
        list_fn = path.join(
            "lists", "buckeye_devpart1.52e70ca864.besgmm_terms_filt7.txt")
        if not path.isfile(list_fn):
            utils.terms_from_pairs(pairs_fn, list_fn)
        else:
            print("Using existing file:", list_fn)
        input_npz_fn = path.join(mfcc_dir, subset + ".dd.npz")
        output_npz_fn = path.join(mfcc_dir, subset + ".besgmm7.dd.npz")
        if not path.isfile(output_npz_fn):
            print("Extracting MFCCs for BES-GMM word tokens:", subset)
            utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)

    print(datetime.now())
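The extract_features and extract_features_for_subset helpers are likewise not reproduced in these excerpts. The ".dd" suffix on the outputs suggests MFCCs with delta and delta-delta coefficients appended. The sketch below is purely illustrative and uses librosa, which is not necessarily the toolkit used by the original scripts; the function name and the wav-list argument are assumptions.

from os import path

import librosa
import numpy as np


def extract_mfcc_dd_sketch(wav_fns, output_npz_fn):
    """Extract 13 MFCCs plus deltas and delta-deltas for each wav file."""
    feat_dict = {}
    for wav_fn in wav_fns:
        y, sr = librosa.load(wav_fn, sr=None)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13,
                                    n_fft=int(0.025 * sr),
                                    hop_length=int(0.010 * sr))
        feats = np.vstack([mfcc, librosa.feature.delta(mfcc),
                           librosa.feature.delta(mfcc, order=2)])
        # Store frames as rows: (n_frames, 39)
        feat_dict[path.splitext(path.basename(wav_fn))[0]] = feats.T
    print("Writing:", output_npz_fn)
    np.savez(output_npz_fn, **feat_dict)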
Example 4
def main():
    args = check_argv()
    feat_type = "mfcc"

    list_dir = path.join("lists", args.language)
    if not path.isdir(list_dir):
        os.makedirs(list_dir)
    feat_dir = path.join(feat_type, args.language)
    if not path.isdir(feat_dir):
        os.makedirs(feat_dir)

    # All ground truth word segments with pronunciations
    for subset in ["dev"]:  #, "eval", "train"]:

        list_fn = path.join(list_dir, subset + ".all_gt_words.list")
        pronunciations_fn = path.join(list_dir, subset + ".prons")

        # Read forced alignments and obtain pronunciations
        word_fa_fn = path.join(gp_alignments_dir, args.language,
                               subset + ".ctm")
        phone_fa_fn = path.join(
            # gp_alignments_dir, args.language, subset + ".phone.ctm"
            gp_alignments_dir,
            args.language,
            subset + ".phone.ipa.ctm")
        if not path.isfile(phone_fa_fn):
            print("Warning: IPA pronunciations not found")
            phone_fa_fn = path.join(gp_alignments_dir, args.language,
                                    subset + ".phone.ctm")
        pronunciations_dict = pronunciations_from_fa(word_fa_fn, phone_fa_fn)

        # Write pronunciation list
        if not path.isfile(pronunciations_fn):
            print("Writing:", pronunciations_fn)
            with codecs.open(pronunciations_fn, "w", "utf-8") as f:
                for segment_key in sorted(pronunciations_dict):
                    f.write(segment_key + " " +
                            ",".join(pronunciations_dict[segment_key]) + "\n")
        else:
            print("Using existing file:", pronunciations_fn)

        # Write word list
        if not path.isfile(list_fn):
            print("Writing:", list_fn)
            with codecs.open(list_fn, "w", "utf-8") as f:
                for segment_key in sorted(pronunciations_dict):
                    f.write(segment_key + "\n")
        else:
            print("Using existing file:", list_fn)

        # Write individual phone list
        phone_list_fn = path.join(list_dir, subset + ".phone.list")
        if not path.isfile(phone_list_fn):
            utils.filter_words(phone_fa_fn,
                               phone_list_fn,
                               min_frames=5,
                               min_chars=0)
        else:
            print("Using existing file:", phone_list_fn)

        # Filter phones
        print("Reading:", phone_list_fn)
        phone_segment_keys = []
        with codecs.open(phone_list_fn, "r", "utf-8") as f:
            for line in f:
                phone_segment_keys.append(line.strip())
        phone_filtered_keys = filter_segment_keys(phone_segment_keys,
                                                  n_max_tokens=5000)
        phone_filtered_list_fn = path.join(list_dir,
                                           subset + ".filter1_phone.list")
        print("Writing:", phone_filtered_list_fn)
        if not path.isfile(phone_filtered_list_fn):
            with codecs.open(phone_filtered_list_fn, "w", "utf-8") as f:
                for segment_key in sorted(phone_filtered_keys):
                    f.write(segment_key + "\n")
        else:
            print("Using existing file:", phone_filtered_list_fn)

        # Extract phone segments from the MFCC NumPy archives
        input_npz_fn = path.join("..", "features", feat_type, args.language,
                                 args.language.lower() + "." + subset + ".npz")
        output_npz_fn = path.join(
            feat_dir,
            args.language.lower() + "." + subset + ".filter1_phone.npz")
        if not path.isfile(output_npz_fn):
            utils.segments_from_npz(input_npz_fn, phone_filtered_list_fn,
                                    output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)

        if args.analyse:
            import matplotlib.pyplot as plt
            import numpy as np

            # Most common words
            labels = [i.split("_")[0] for i in pronunciations_dict]
            counter = Counter(labels)
            print("No. word types:", len(counter))
            print("No. word tokens:", len(labels))
            print("Most common words:", counter.most_common(10))

            # Histogram of word count
            counts = list(counter.values())
            plt.figure()
            plt.hist(counts, 50)
            plt.yscale("log")
            plt.ylabel("No. of types with this many tokens")
            plt.xlabel("No. of tokens")

            # # Temp
            # # Most common words
            # labels = [i.split("_")[0] for i in filtered_keys]
            # counter = Counter(labels)
            # print("No. word types:", len(counter))
            # print("No. word tokens:", len(labels))
            # print("Most common words:", counter.most_common(10))

            # # Histogram of word count
            # counts = counter.values()
            # plt.figure()
            # plt.hist(counts, 50)
            # plt.yscale("log")
            # plt.ylabel("No. of types with this many tokens")
            # plt.xlabel("No. of tokens")

            plt.show()

        # Filter 1
        print("Applying filter 1")
        n_min_tokens_per_type = 10
        n_max_tokens_per_type = 25
        filtered_keys = filter_segment_keys(list(pronunciations_dict),
                                            n_min_tokens_per_type,
                                            n_max_tokens_per_type)
        print("No. tokens:", len(filtered_keys))
        print("No. types:", len(set([i.split("_")[0] for i in filtered_keys])))
        filtered_list_fn = path.join(list_dir, subset + ".filter1_gt.list")
        print("Writing:", filtered_list_fn)
        if not path.isfile(filtered_list_fn):
            with codecs.open(filtered_list_fn, "w", "utf-8") as f:
                for segment_key in sorted(filtered_keys):
                    f.write(segment_key + "\n")
        else:
            print("Using existing file:", filtered_list_fn)

        # Extract word segments from the MFCC NumPy archives
        input_npz_fn = path.join("..", "features", feat_type, args.language,
                                 args.language.lower() + "." + subset + ".npz")
        output_npz_fn = path.join(
            feat_dir,
            args.language.lower() + "." + subset + ".filter1_gt.npz")
        if not path.isfile(output_npz_fn):
            utils.segments_from_npz(input_npz_fn, filtered_list_fn,
                                    output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)
Example 5
def main():

    print(datetime.now())

    # RAW FEATURES

    # Extract MFCCs
    mfcc_dir = path.join("mfcc", "xitsonga")
    if not path.isdir(mfcc_dir):
        os.makedirs(mfcc_dir)
    output_fn = path.join(mfcc_dir, "xitsonga.dd.npz")
    if not path.isfile(output_fn):
        print("Extracting MFCCs")
        extract_features("mfcc", output_fn)
    else:
        print("Using existing file:", output_fn)

    # Extract filterbanks
    fbank_dir = path.join("fbank", "xitsonga")
    if not path.isdir(fbank_dir):
        os.makedirs(fbank_dir)
    output_fn = path.join(fbank_dir, "xitsonga.npz")
    if not path.isfile(output_fn):
        print("Extracting filterbanks")
        extract_features("fbank", output_fn)
    else:
        print("Using existing file:", output_fn)

    # GROUND TRUTH WORD SEGMENTS

    # Create a ground truth word list of at least 50 frames and 5 characters
    fa_fn = path.join("..", "data", "xitsonga.wrd")
    list_dir = "lists"
    if not path.isdir(list_dir):
        os.makedirs(list_dir)
    list_fn = path.join(list_dir, "xitsonga.samediff.list")
    if not path.isfile(list_fn):
        utils.write_samediff_words(fa_fn, list_fn)
    else:
        print("Using existing file:", list_fn)

    # Extract word segments from the MFCC NumPy archive
    input_npz_fn = path.join(mfcc_dir, "xitsonga.dd.npz")
    output_npz_fn = path.join(mfcc_dir, "xitsonga.samediff.dd.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting MFCCs for same-different word tokens")
        utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)

    # Extract word segments from the filterbank NumPy archive
    input_npz_fn = path.join(fbank_dir, "xitsonga.npz")
    output_npz_fn = path.join(fbank_dir, "xitsonga.samediff.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting filterbanks for same-different word tokens")
        utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)

    # UTD-DISCOVERED WORD SEGMENTS

    # Remove non-VAD regions from the UTD pair list
    input_pairs_fn = path.join("..", "data", "zs_tsonga.fdlps.0.925.pairs.v0")
    output_pairs_fn = path.join("lists", "xitsonga.utd_pairs.list")
    if not path.isfile(output_pairs_fn):
        # Read voice activity regions
        fa_fn = path.join("..", "data", "xitsonga.wrd")
        print("Reading:", fa_fn)
        vad_dict = utils.read_vad_from_fa(fa_fn)

        # Create new pair list
        utils.strip_nonvad_from_pairs(vad_dict, input_pairs_fn,
                                      output_pairs_fn)
    else:
        print("Using existing file:", output_pairs_fn)

    # Create the UTD word list
    list_fn = path.join("lists", "xitsonga.utd_terms.list")
    if not path.isfile(list_fn):
        utils.terms_from_pairs(output_pairs_fn, list_fn)
    else:
        print("Using existing file:", list_fn)

    # Extract UTD segments from the MFCC NumPy archives
    input_npz_fn = path.join(mfcc_dir, "xitsonga.dd.npz")
    output_npz_fn = path.join(mfcc_dir, "xitsonga.utd.dd.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting MFCCs for UTD word tokens")
        utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)

    # BES-GMM DISCOVERED WORD SEGMENTS

    # All discovered words
    input_npz_fn = path.join(mfcc_dir, "xitsonga.dd.npz")
    output_npz_fn = path.join(mfcc_dir, "xitsonga.besgmm.dd.npz")
    if not path.isfile(output_npz_fn):
        list_fn = path.join("..", "data",
                            "buckeye_devpart1.52e70ca864.besgmm_terms.txt")
        print("Extracting MFCCs for BES-GMM word tokens")
        utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)

    # A maximum of three pairs per class
    pairs_fn = path.join("..", "data",
                         "xitsonga.d18547ee5e.besgmm_pairs_filt7.txt")
    list_fn = path.join("lists", "xitsonga.d18547ee5e.besgmm_pairs_filt7.txt")
    if not path.isfile(list_fn):
        utils.terms_from_pairs(pairs_fn, list_fn)
    else:
        print("Using existing file:", list_fn)
    input_npz_fn = path.join(mfcc_dir, "xitsonga.dd.npz")
    output_npz_fn = path.join(mfcc_dir, "xitsonga.besgmm7.dd.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting MFCCs for BES-GMM word tokens")
        utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)

    print(datetime.now())
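Examples 3 and 5 both call utils.terms_from_pairs to turn a pair list into a term list; Example 6 below performs the same operation inline, so the helper presumably amounts to the following (the sketch name is hypothetical):

import codecs


def terms_from_pairs_sketch(pairs_fn, list_fn):
    """Write the sorted set of unique segment keys appearing in a pair list."""
    print("Reading:", pairs_fn)
    terms = set()
    with codecs.open(pairs_fn, "r", "utf-8") as pairs_f:
        for line in pairs_f:
            term1, term2 = line.strip().split(" ")
            terms.add(term1)
            terms.add(term2)
    print("Writing:", list_fn)
    with codecs.open(list_fn, "w", "utf-8") as list_f:
        for term in sorted(terms):
            list_f.write(term + "\n")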
Example 6
def main():
    args = check_argv()
    feat_type = "mfcc"

    # RAW FEATURES

    # Extract MFCCs for the different sets
    feat_dir = path.join(feat_type, args.language)
    if not path.isdir(feat_dir):
        os.makedirs(feat_dir)
    for subset in ["dev", "eval", "train"]:
        raw_feat_fn = path.join(feat_dir,
                                args.language.lower() + "." + subset + ".npz")
        if not path.isfile(raw_feat_fn):
            print("Extracting MFCCs:", subset)
            extract_features_for_subset(args.language, subset, feat_type,
                                        raw_feat_fn)
        else:
            print("Using existing file:", raw_feat_fn)

    # assert False

    # GROUND TRUTH WORD SEGMENTS

    list_dir = path.join("lists", args.language)
    if not path.isdir(list_dir):
        os.makedirs(list_dir)
    for subset in ["dev", "eval", "train"]:

        # Create a ground truth word list (at least 50 frames and 5 characters)
        fa_fn = path.join(gp_alignments_dir, args.language, subset + ".ctm")
        list_fn = path.join(list_dir, subset + ".gt_words.list")
        if not path.isfile(list_fn):
            if args.language == "KO":
                min_frames = 26
                min_chars = 3
            elif args.language == "TH":
                min_frames = 38
                min_chars = 2
            elif args.language == "VN":
                min_frames = 30
                min_chars = 4
            else:
                min_frames = 50
                min_chars = 5
            utils.filter_words(fa_fn,
                               list_fn,
                               min_frames=min_frames,
                               min_chars=min_chars)
        else:
            print("Using existing file:", list_fn)

        # Extract word segments from the MFCC NumPy archives
        input_npz_fn = path.join(feat_dir,
                                 args.language.lower() + "." + subset + ".npz")
        output_npz_fn = path.join(
            feat_dir,
            args.language.lower() + "." + subset + ".gt_words.npz")
        if not path.isfile(output_npz_fn):
            print("Extracting MFCCs for ground truth word tokens:", subset)
            utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)

    # UTD-DISCOVERED WORD SEGMENTS

    # Change Enno Hermann's pair file to the appropriate format
    enno_pairs_fn = path.join(
        "..",
        "data",
        args.language,  # "pairs_sw_utd.train"
        "pairs_sw_utd_plp_vtln.train")
    if not path.isfile(enno_pairs_fn):
        # This might not be an evaluation language
        return
    pairs_fn = path.join("lists", args.language, "train.utd_pairs.list")
    if not path.isfile(pairs_fn):
        utils.format_enno_pairs(enno_pairs_fn, pairs_fn)
    else:
        print("Using existing file:", pairs_fn)
    list_fn = path.join("lists", args.language, "train.utd_terms.list")
    if not path.isfile(list_fn):
        print("Reading:", pairs_fn)
        terms = set()
        with codecs.open(pairs_fn, "r", "utf-8") as pairs_f:
            for line in pairs_f:
                term1, term2 = line.strip().split(" ")
                terms.add(term1)
                terms.add(term2)
        print("Writing:", list_fn)
        with codecs.open(list_fn, "w", "utf-8") as list_f:
            for term in sorted(terms):
                list_f.write(term + "\n")
    else:
        print("Using existing file:", list_fn)

    # Extract UTD segments
    input_npz_fn = path.join(feat_dir, args.language.lower() + ".train.npz")
    output_npz_fn = path.join(feat_dir,
                              args.language.lower() + ".train.utd_terms.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting MFCCs for UTD word tokens")
        utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)

    # UTD SEGMENTS THAT HAVE BEEN PARTIALLY FIXED

    # Write list with fixed labels and segments
    fixed_labels_list_fn = path.join("lists", args.language,
                                     "train.utd_terms.fixed_labels.list")
    fixed_segs_list_fn = path.join("lists", args.language,
                                   "train.utd_terms.fixed_segs.list")
    fixed_labels_segs_list_fn = path.join(
        "lists", args.language, "train.utd_terms.fixed_labels_segs.list")
    if (not path.isfile(fixed_labels_list_fn)
            or not path.isfile(fixed_labels_segs_list_fn)
            or not path.isfile(fixed_segs_list_fn)):

        # Read UTD terms
        utd_list_fn = path.join("lists", args.language, "train.utd_terms.list")
        print("Reading:", utd_list_fn)
        # overlap_dict[speaker_utt][(start, end)] is a tuple of
        # (label, (start, end), overlap, cluster_label)
        overlap_dict = {}
        with codecs.open(utd_list_fn, "r", "utf-8") as utd_list_f:
            for line in utd_list_f:
                term, speaker, utt, start_end = line.strip().split("_")
                start, end = start_end.split("-")
                start = int(start)
                end = int(end)
                utt_key = speaker + "_" + utt
                if utt_key not in overlap_dict:
                    overlap_dict[utt_key] = {}
                overlap_dict[utt_key][(start, end)] = (
                    "label", (0, 0), 0, term)

        # Read forced alignments
        # The UTD pairs cover the training set
        fa_fn = path.join(gp_alignments_dir, args.language, "train.ctm")
        print("Reading:", fa_fn)
        fa_dict = {}
        with codecs.open(fa_fn, "r", "utf-8") as fa_f:
            for line in fa_f:
                utt_key, _, start, duration, label = line.strip().split()
                start = float(start)
                duration = float(duration)
                end = start + duration
                start_frame = int(round(start * 100))
                end_frame = int(round(end * 100))
                if label not in ("<unk>", "sil", "?", "spn"):
                    if utt_key not in fa_dict:
                        fa_dict[utt_key] = {}
                    fa_dict[utt_key][(start_frame, end_frame)] = label

        # Find ground truth terms with maximal overlap
        print("Getting ground truth terms with maximal overlap:")
        for utt_key in tqdm(fa_dict):
            # print(utt_key)
            if utt_key not in overlap_dict:
                continue
            for (fa_start, fa_end) in fa_dict[utt_key]:
                for (utd_start, utd_end) in overlap_dict[utt_key]:
                    overlap = get_overlap(utd_start, utd_end, fa_start, fa_end)
                    if overlap == 0:
                        continue
                    if (overlap >
                            overlap_dict[utt_key][(utd_start, utd_end)][2]):
                        overlap_dict[utt_key][(utd_start, utd_end)] = (
                            fa_dict[utt_key][(fa_start, fa_end)],
                            (fa_start, fa_end), overlap,
                            overlap_dict[utt_key][(utd_start, utd_end)][3])

        # Write list with fixed labels
        if not path.isfile(fixed_labels_list_fn):
            print("Writing:", fixed_labels_list_fn)
            with codecs.open(fixed_labels_list_fn, "w", "utf-8") as list_f:
                for utt_key in sorted(overlap_dict):
                    for (utd_start, utd_end) in overlap_dict[utt_key]:
                        label = overlap_dict[utt_key][(utd_start, utd_end)][0]
                        overlap = (overlap_dict[utt_key][(utd_start,
                                                          utd_end)][2])
                        if overlap != 0:
                            list_f.write("{}_{}_{:06d}-{:06d}\n".format(
                                label, utt_key, utd_start, utd_end))
        else:
            print("Using existing file:", fixed_labels_list_fn)

        # Write list with fixed labels and segment intervals
        if not path.isfile(fixed_labels_segs_list_fn):
            print("Writing:", fixed_labels_segs_list_fn)
            with (codecs.open(fixed_labels_segs_list_fn, "w",
                              "utf-8")) as list_f:
                for utt_key in sorted(overlap_dict):
                    for (utd_start, utd_end) in overlap_dict[utt_key]:
                        label = overlap_dict[utt_key][(utd_start, utd_end)][0]
                        fa_start, fa_end = (overlap_dict[utt_key][(
                            utd_start, utd_end)][1])
                        overlap = (overlap_dict[utt_key][(utd_start,
                                                          utd_end)][2])
                        if overlap != 0:
                            list_f.write("{}_{}_{:06d}-{:06d}\n".format(
                                label, utt_key, fa_start, fa_end))
        else:
            print("Using existing file:", fixed_labels_segs_list_fn)

        # Write list with fixed segment intervals
        if not path.isfile(fixed_segs_list_fn):
            print("Writing:", fixed_segs_list_fn)
            with (codecs.open(fixed_segs_list_fn, "w", "utf-8")) as list_f:
                for utt_key in sorted(overlap_dict):
                    for (utd_start, utd_end) in overlap_dict[utt_key]:
                        label = overlap_dict[utt_key][(utd_start, utd_end)][3]
                        fa_start, fa_end = (overlap_dict[utt_key][(
                            utd_start, utd_end)][1])
                        overlap = (overlap_dict[utt_key][(utd_start,
                                                          utd_end)][2])
                        if overlap != 0:
                            list_f.write("{}_{}_{:06d}-{:06d}\n".format(
                                label, utt_key, fa_start, fa_end))
        else:
            print("Using existing file:", fixed_segs_list_fn)

    else:
        print("Using existing file:", fixed_labels_list_fn)
        print("Using existing file:", fixed_segs_list_fn)
        print("Using existing file:", fixed_labels_segs_list_fn)

    # Extract UTD with fixed labels
    input_npz_fn = path.join(feat_dir, args.language.lower() + ".train.npz")
    output_npz_fn = path.join(
        feat_dir,
        args.language.lower() + ".train.utd_terms.fixed_labels.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting MFCCs for UTD tokens with fixed labels")
        utils.segments_from_npz(input_npz_fn, fixed_labels_list_fn,
                                output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)

    # Extract UTD with fixed segment intervals
    input_npz_fn = path.join(feat_dir, args.language.lower() + ".train.npz")
    output_npz_fn = path.join(
        feat_dir,
        args.language.lower() + ".train.utd_terms.fixed_segs.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting MFCCs for UTD tokens with fixed labels and segment "
              "intervals")
        utils.segments_from_npz(input_npz_fn, fixed_segs_list_fn,
                                output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)

    # Extract UTD with fixed labels and segment intervals
    input_npz_fn = path.join(feat_dir, args.language.lower() + ".train.npz")
    output_npz_fn = path.join(
        feat_dir,
        args.language.lower() + ".train.utd_terms.fixed_labels_segs.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting MFCCs for UTD tokens with fixed labels and segment "
              "intervals")
        utils.segments_from_npz(input_npz_fn, fixed_labels_segs_list_fn,
                                output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)