def main():

    print(datetime.now())

    # RAW FEATURES

    # Extract MFCCs
    mfcc_dir = path.join("mfcc", "xitsonga")
    if not path.isdir(mfcc_dir):
        os.makedirs(mfcc_dir)
    output_fn = path.join(mfcc_dir, "xitsonga.dd.npz")
    if not path.isfile(output_fn):
        print("Extracting MFCCs")
        extract_features("mfcc", output_fn)
    else:
        print("Using existing file:", output_fn)

    # # Extract filterbanks
    # fbank_dir = path.join("fbank", "xitsonga")
    # if not path.isdir(fbank_dir):
    #     os.makedirs(fbank_dir)
    # output_fn = path.join(fbank_dir, "xitsonga.npz")
    # if not path.isfile(output_fn):
    #     print("Extracting filterbanks")
    #     extract_features("fbank", output_fn)
    # else:
    #     print("Using existing file:", output_fn)

    # GROUND TRUTH WORD SEGMENTS

    # Create a ground truth word list of at least 50 frames and 5 characters
    fa_fn = path.join("..", "data", "xitsonga.wrd")
    list_dir = "lists"
    if not path.isdir(list_dir):
        os.makedirs(list_dir)
    list_fn = path.join(list_dir, "xitsonga.samediff.list")
    if not path.isfile(list_fn):
        utils.write_samediff_words(fa_fn, list_fn)
    else:
        print("Using existing file:", list_fn)

    # Extract word segments from the MFCC NumPy archive
    input_npz_fn = path.join(mfcc_dir, "xitsonga.dd.npz")
    output_npz_fn = path.join(mfcc_dir, "xitsonga.samediff.dd.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting MFCCs for same-different word tokens")
        utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)

    # # Extract word segments from the filterbank NumPy archive
    # input_npz_fn = path.join(fbank_dir, "xitsonga.npz")
    # output_npz_fn = path.join(fbank_dir, "xitsonga.samediff.npz")
    # if not path.isfile(output_npz_fn):
    #     print("Extracting filterbanks for same-different word tokens")
    #     utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
    # else:
    #     print("Using existing file:", output_npz_fn)

    print(datetime.now())
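
# The `extract_features` helper used above is defined elsewhere in this repo
# and is not shown here. The following is only a minimal, hypothetical sketch
# of what the MFCC branch could look like: it assumes 16 kHz mono wavs passed
# in `wav_fns`, 13 MFCCs with deltas and delta-deltas (matching the ".dd"
# suffix), 25 ms windows with a 10 ms shift, and librosa for the signal
# processing. The real helper may use different tooling and settings.
def extract_mfcc_dd_sketch(wav_fns, output_fn):
    from os import path
    import librosa
    import numpy as np

    feat_dict = {}
    for wav_fn in wav_fns:
        key = path.splitext(path.basename(wav_fn))[0]
        y, sr = librosa.load(wav_fn, sr=16000)
        # 400-sample windows and 160-sample shifts at 16 kHz give 100 frames/s
        mfcc = librosa.feature.mfcc(
            y=y, sr=sr, n_mfcc=13, n_fft=400, hop_length=160)
        delta = librosa.feature.delta(mfcc)
        delta_delta = librosa.feature.delta(mfcc, order=2)
        # Stack to (n_frames, 39) per utterance
        feat_dict[key] = np.vstack([mfcc, delta, delta_delta]).T
    print("Writing:", output_fn)
    np.savez(output_fn, **feat_dict)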
def main():

    print(datetime.now())

    # RAW FEATURES

    # Extract MFCCs for the different sets
    mfcc_dir = path.join("mfcc", "buckeye")
    for subset in ["devpart1", "devpart2", "zs"]:
        if not path.isdir(mfcc_dir):
            os.makedirs(mfcc_dir)
        output_fn = path.join(mfcc_dir, subset + ".dd.npz")
        if not path.isfile(output_fn):
            print("Extracting MFCCs:", subset)
            extract_features_for_subset(subset, "mfcc", output_fn)
        else:
            print("Using existing file:", output_fn)

    # # Extract filterbanks for the different sets
    # fbank_dir = path.join("fbank", "buckeye")
    # for subset in ["devpart1", "devpart2", "zs"]:
    #     if not path.isdir(fbank_dir):
    #         os.makedirs(fbank_dir)
    #     output_fn = path.join(fbank_dir, subset + ".npz")
    #     if not path.isfile(output_fn):
    #         print("Extracting filterbanks:", subset)
    #         extract_features_for_subset(subset, "fbank", output_fn)
    #     else:
    #         print("Using existing file:", output_fn)

    # GROUND TRUTH WORD SEGMENTS

    # Create a ground truth word list of at least 50 frames and 5 characters
    fa_fn = path.join("..", "data", "buckeye_english.wrd")
    list_dir = "lists"
    if not path.isdir(list_dir):
        os.makedirs(list_dir)
    list_fn = path.join(list_dir, "buckeye.samediff.list")
    if not path.isfile(list_fn):
        utils.write_samediff_words(fa_fn, list_fn)
    else:
        print("Using existing file:", list_fn)

    # Extract word segments from the MFCC NumPy archives
    for subset in ["devpart1", "devpart2", "zs"]:
        input_npz_fn = path.join(mfcc_dir, subset + ".dd.npz")
        output_npz_fn = path.join(mfcc_dir, subset + ".samediff.dd.npz")
        if not path.isfile(output_npz_fn):
            print("Extracting MFCCs for same-different word tokens:", subset)
            utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)

    print(datetime.now())
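
# `utils.segments_from_npz` is defined in the shared utilities and not shown
# here. As a rough, assumed sketch of its behaviour: each line of the list file
# is a segment key of the form "<label>_<utterance>_<start>-<end>" (frame
# indices at 100 frames/s, end taken as exclusive here), and the corresponding
# rows are cut from that utterance's feature matrix in the input archive. The
# key convention is inferred from how the lists are written in these scripts;
# the real helper may differ in detail.
def segments_from_npz_sketch(input_npz_fn, segments_fn, output_npz_fn):
    import codecs
    import numpy as np

    print("Reading:", input_npz_fn)
    input_npz = np.load(input_npz_fn)
    output_dict = {}
    with codecs.open(segments_fn, "r", "utf-8") as f:
        for line in f:
            segment_key = line.strip()
            # Drop the label, then split off the frame interval
            utt_key_and_interval = segment_key.split("_", 1)[1]
            utt_key, interval = utt_key_and_interval.rsplit("_", 1)
            start, end = [int(i) for i in interval.split("-")]
            output_dict[segment_key] = input_npz[utt_key][start:end]
    print("Writing:", output_npz_fn)
    np.savez(output_npz_fn, **output_dict)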
def main():

    print(datetime.now())

    # RAW FEATURES

    # Extract MFCCs for the different sets
    mfcc_dir = path.join("mfcc", "buckeye")
    for subset in ["devpart1", "devpart2", "zs"]:
        if not path.isdir(mfcc_dir):
            os.makedirs(mfcc_dir)
        output_fn = path.join(mfcc_dir, subset + ".dd.npz")
        if not path.isfile(output_fn):
            print("Extracting MFCCs:", subset)
            extract_features_for_subset(subset, "mfcc", output_fn)
        else:
            print("Using existing file:", output_fn)

    # Extract filterbanks for the different sets
    fbank_dir = path.join("fbank", "buckeye")
    for subset in ["devpart1", "devpart2", "zs"]:
        if not path.isdir(fbank_dir):
            os.makedirs(fbank_dir)
        output_fn = path.join(fbank_dir, subset + ".npz")
        if not path.isfile(output_fn):
            print("Extracting filterbanks:", subset)
            extract_features_for_subset(subset, "fbank", output_fn)
        else:
            print("Using existing file:", output_fn)

    # GROUND TRUTH WORD SEGMENTS

    # Create a ground truth word list of at least 50 frames and 5 characters
    fa_fn = path.join("..", "data", "buckeye_english.wrd")
    list_dir = "lists"
    if not path.isdir(list_dir):
        os.makedirs(list_dir)
    list_fn = path.join(list_dir, "buckeye.samediff.list")
    if not path.isfile(list_fn):
        utils.write_samediff_words(fa_fn, list_fn)
    else:
        print("Using existing file:", list_fn)

    # Extract word segments from the MFCC NumPy archives
    for subset in ["devpart1", "devpart2", "zs"]:
        input_npz_fn = path.join(mfcc_dir, subset + ".dd.npz")
        output_npz_fn = path.join(mfcc_dir, subset + ".samediff.dd.npz")
        if not path.isfile(output_npz_fn):
            print("Extracting MFCCs for same-different word tokens:", subset)
            utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)

    # Extract word segments from the filterbank NumPy archives
    for subset in ["devpart1", "devpart2", "zs"]:
        input_npz_fn = path.join(fbank_dir, subset + ".npz")
        output_npz_fn = path.join(fbank_dir, subset + ".samediff.npz")
        if not path.isfile(output_npz_fn):
            print("Extracting filterbanks for same-different word tokens:",
                subset)
            utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)

    # Create a ground truth word list of at least 39 frames and 4 characters
    list_fn = path.join(list_dir, "buckeye.samediff2.list")
    if not path.isfile(list_fn):
        utils.write_samediff_words(fa_fn, list_fn, min_frames=39, min_chars=4)
    else:
        print("Using existing file:", list_fn)

    # Extract word segments from the MFCC NumPy archives
    for subset in ["devpart1"]:  # , "devpart2", "zs"]:
        input_npz_fn = path.join(mfcc_dir, subset + ".dd.npz")
        output_npz_fn = path.join(mfcc_dir, subset + ".samediff2.dd.npz")
        if not path.isfile(output_npz_fn):
            print("Extracting MFCCs for same-different word tokens:", subset)
            utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)

    # UTD-DISCOVERED WORD SEGMENTS

    # Remove non-VAD regions from the UTD pair list
    input_pairs_fn = path.join("..", "data", "buckeye.fdlps.0.93.pairs")
    output_pairs_fn = path.join("lists", "buckeye.utd_pairs.list")
    if not path.isfile(output_pairs_fn):
        # Read voice activity regions
        fa_fn = path.join("..", "data", "buckeye_english.wrd")
        print("Reading:", fa_fn)
        vad_dict = utils.read_vad_from_fa(fa_fn)
        # Create new pair list
        utils.strip_nonvad_from_pairs(vad_dict, input_pairs_fn, output_pairs_fn)
    else:
        print("Using existing file:", output_pairs_fn)

    # Create the UTD word list
    list_fn = path.join("lists", "buckeye.utd_terms.list")
    if not path.isfile(list_fn):
        utils.terms_from_pairs(output_pairs_fn, list_fn)
    else:
        print("Using existing file:", list_fn)

    # Extract UTD segments from the MFCC NumPy archives
    for subset in ["devpart1"]:

        # Extract pair and term list for speakers in subset
        speaker_fn = path.join(
            "..", "data", "buckeye_{}_speakers.list".format(subset))
        input_pairs_fn = output_pairs_fn
        output_pairs_fn = path.join("lists", "devpart1.utd_pairs.list")
        if not path.isfile(output_pairs_fn):
            utils.pairs_for_speakers(speaker_fn, input_pairs_fn, output_pairs_fn)
        else:
            print("Using existing file:", output_pairs_fn)
        list_fn = path.join("lists", "devpart1.utd_terms.list")
        if not path.isfile(list_fn):
            utils.terms_from_pairs(output_pairs_fn, list_fn)
        else:
            print("Using existing file:", list_fn)

        # Extract UTD segments
        input_npz_fn = path.join(mfcc_dir, subset + ".dd.npz")
        output_npz_fn = path.join(mfcc_dir, subset + ".utd.dd.npz")
        if not path.isfile(output_npz_fn):
            print("Extracting MFCCs for UTD word tokens:", subset)
            utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)

    # BES-GMM DISCOVERED WORD SEGMENTS

    for subset in ["devpart1"]:

        # All discovered words
        input_npz_fn = path.join(mfcc_dir, subset + ".dd.npz")
        output_npz_fn = path.join(mfcc_dir, subset + ".besgmm.dd.npz")
        if not path.isfile(output_npz_fn):
            list_fn = path.join(
                "..", "data", "buckeye_devpart1.52e70ca864.besgmm_terms.txt")
            print("Extracting MFCCs for BES-GMM word tokens:", subset)
            utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)

        # A maximum of three pairs per class
        pairs_fn = path.join(
            "..", "data", "buckeye_devpart1.52e70ca864.besgmm_pairs_filt7.txt")
        list_fn = path.join(
            "lists", "buckeye_devpart1.52e70ca864.besgmm_terms_filt7.txt")
        if not path.isfile(list_fn):
            utils.terms_from_pairs(pairs_fn, list_fn)
        else:
            print("Using existing file:", list_fn)
        input_npz_fn = path.join(mfcc_dir, subset + ".dd.npz")
        output_npz_fn = path.join(mfcc_dir, subset + ".besgmm7.dd.npz")
        if not path.isfile(output_npz_fn):
            print("Extracting MFCCs for BES-GMM word tokens:", subset)
            utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)

    print(datetime.now())
def main(): args = check_argv() feat_type = "mfcc" list_dir = path.join("lists", args.language) if not path.isdir(list_dir): os.makedirs(list_dir) feat_dir = path.join(feat_type, args.language) if not path.isdir(feat_dir): os.makedirs(feat_dir) # All ground truth word segments with pronunciations for subset in ["dev"]: #, "eval", "train"]: list_fn = path.join(list_dir, subset + ".all_gt_words.list") pronunciations_fn = path.join(list_dir, subset + ".prons") # Read forced alignments and obtain pronunciations word_fa_fn = path.join(gp_alignments_dir, args.language, subset + ".ctm") phone_fa_fn = path.join( # gp_alignments_dir, args.language, subset + ".phone.ctm" gp_alignments_dir, args.language, subset + ".phone.ipa.ctm") if not path.isfile(phone_fa_fn): print("Warning: IPA pronunciations not found") phone_fa_fn = path.join(gp_alignments_dir, args.language, subset + ".phone.ctm") pronunciations_dict = pronunciations_from_fa(word_fa_fn, phone_fa_fn) # Write pronunciation list if not path.isfile(pronunciations_fn): print("Writing:", pronunciations_fn) with codecs.open(pronunciations_fn, "w", "utf-8") as f: for segment_key in sorted(pronunciations_dict): f.write(segment_key + " " + ",".join(pronunciations_dict[segment_key]) + "\n") else: print("Using existing file:", pronunciations_fn) # Write word list if not path.isfile(list_fn): print("Writing:", list_fn) with codecs.open(list_fn, "w", "utf-8") as f: for segment_key in sorted(pronunciations_dict): f.write(segment_key + "\n") else: print("Using existing file:", list_fn) # Write individual phone list phone_list_fn = path.join(list_dir, subset + ".phone.list") if not path.isfile(phone_list_fn): utils.filter_words(phone_fa_fn, phone_list_fn, min_frames=5, min_chars=0) else: print("Using existing file:", phone_list_fn) # Filter phones print("Reading:", phone_list_fn) phone_segment_keys = [] with codecs.open(phone_list_fn, "r", "utf-8") as f: for line in f: phone_segment_keys.append(line.strip()) phone_filtered_keys = filter_segment_keys(phone_segment_keys, n_max_tokens=5000) phone_filtered_list_fn = path.join(list_dir, subset + ".filter1_phone.list") print("Writing:", phone_filtered_list_fn) if not path.isfile(phone_filtered_list_fn): with codecs.open(phone_filtered_list_fn, "w", "utf-8") as f: for segment_key in sorted(phone_filtered_keys): f.write(segment_key + "\n") else: print("Using existing file:", phone_filtered_list_fn) # Extract phone segments from the MFCC NumPy archives input_npz_fn = path.join("..", "features", feat_type, args.language, args.language.lower() + "." + subset + ".npz") output_npz_fn = path.join( feat_dir, args.language.lower() + "." + subset + ".filter1_phone.npz") if not path.isfile(output_npz_fn): utils.segments_from_npz(input_npz_fn, phone_filtered_list_fn, output_npz_fn) else: print("Using existing file:", output_npz_fn) if args.analyse: import matplotlib.pyplot as plt import numpy as np # Most common words labels = [i.split("_")[0] for i in pronunciations_dict] counter = Counter(labels) print("No. word types:", len(counter)) print("No. word tokens:", len(labels)) print("Most common words:", counter.most_common(10)) # Histogram of word count counts = counter.values() plt.figure() plt.hist(counts, 50) plt.yscale("log") plt.ylabel("No. of types with this many tokens") plt.xlabel("No. of tokens") # # Temp # # Most common words # labels = [i.split("_")[0] for i in filtered_keys] # counter = Counter(labels) # print("No. word types:", len(counter)) # print("No. 
word tokens:", len(labels)) # print("Most common words:", counter.most_common(10)) # # Histogram of word count # counts = counter.values() # plt.figure() # plt.hist(counts, 50) # plt.yscale("log") # plt.ylabel("No. of types with this many tokens") # plt.xlabel("No. of tokens") plt.show() # Filter 1 print("Applying filter 1") n_min_tokens_per_type = 10 n_max_tokens_per_type = 25 filtered_keys = filter_segment_keys(list(pronunciations_dict), n_min_tokens_per_type, n_max_tokens_per_type) print("No. tokens:", len(filtered_keys)) print("No. types:", len(set([i.split("_")[0] for i in filtered_keys]))) filtered_list_fn = path.join(list_dir, subset + ".filter1_gt.list") print("Writing:", filtered_list_fn) if not path.isfile(filtered_list_fn): with codecs.open(filtered_list_fn, "w", "utf-8") as f: for segment_key in sorted(filtered_keys): f.write(segment_key + "\n") else: print("Using existing file:", filtered_list_fn) # Extract word segments from the MFCC NumPy archives input_npz_fn = path.join("..", "features", feat_type, args.language, args.language.lower() + "." + subset + ".npz") output_npz_fn = path.join( feat_dir, args.language.lower() + "." + subset + ".filter1_gt.npz") if not path.isfile(output_npz_fn): utils.segments_from_npz(input_npz_fn, filtered_list_fn, output_npz_fn) else: print("Using existing file:", output_npz_fn)
def main():

    print(datetime.now())

    # RAW FEATURES

    # Extract MFCCs
    mfcc_dir = path.join("mfcc", "xitsonga")
    if not path.isdir(mfcc_dir):
        os.makedirs(mfcc_dir)
    output_fn = path.join(mfcc_dir, "xitsonga.dd.npz")
    if not path.isfile(output_fn):
        print("Extracting MFCCs")
        extract_features("mfcc", output_fn)
    else:
        print("Using existing file:", output_fn)

    # Extract filterbanks
    fbank_dir = path.join("fbank", "xitsonga")
    if not path.isdir(fbank_dir):
        os.makedirs(fbank_dir)
    output_fn = path.join(fbank_dir, "xitsonga.npz")
    if not path.isfile(output_fn):
        print("Extracting filterbanks")
        extract_features("fbank", output_fn)
    else:
        print("Using existing file:", output_fn)

    # GROUND TRUTH WORD SEGMENTS

    # Create a ground truth word list of at least 50 frames and 5 characters
    fa_fn = path.join("..", "data", "xitsonga.wrd")
    list_dir = "lists"
    if not path.isdir(list_dir):
        os.makedirs(list_dir)
    list_fn = path.join(list_dir, "xitsonga.samediff.list")
    if not path.isfile(list_fn):
        utils.write_samediff_words(fa_fn, list_fn)
    else:
        print("Using existing file:", list_fn)

    # Extract word segments from the MFCC NumPy archive
    input_npz_fn = path.join(mfcc_dir, "xitsonga.dd.npz")
    output_npz_fn = path.join(mfcc_dir, "xitsonga.samediff.dd.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting MFCCs for same-different word tokens")
        utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)

    # Extract word segments from the filterbank NumPy archive
    input_npz_fn = path.join(fbank_dir, "xitsonga.npz")
    output_npz_fn = path.join(fbank_dir, "xitsonga.samediff.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting filterbanks for same-different word tokens")
        utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)

    # UTD-DISCOVERED WORD SEGMENTS

    # Remove non-VAD regions from the UTD pair list
    input_pairs_fn = path.join("..", "data", "zs_tsonga.fdlps.0.925.pairs.v0")
    output_pairs_fn = path.join("lists", "xitsonga.utd_pairs.list")
    if not path.isfile(output_pairs_fn):
        # Read voice activity regions
        fa_fn = path.join("..", "data", "xitsonga.wrd")
        print("Reading:", fa_fn)
        vad_dict = utils.read_vad_from_fa(fa_fn)
        # Create new pair list
        utils.strip_nonvad_from_pairs(vad_dict, input_pairs_fn, output_pairs_fn)
    else:
        print("Using existing file:", output_pairs_fn)

    # Create the UTD word list
    list_fn = path.join("lists", "xitsonga.utd_terms.list")
    if not path.isfile(list_fn):
        utils.terms_from_pairs(output_pairs_fn, list_fn)
    else:
        print("Using existing file:", list_fn)

    # Extract UTD segments from the MFCC NumPy archives
    input_npz_fn = path.join(mfcc_dir, "xitsonga.dd.npz")
    output_npz_fn = path.join(mfcc_dir, "xitsonga.utd.dd.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting MFCCs for UTD word tokens")
        utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)

    # BES-GMM DISCOVERED WORD SEGMENTS

    # All discovered words
    input_npz_fn = path.join(mfcc_dir, "xitsonga.dd.npz")
    output_npz_fn = path.join(mfcc_dir, "xitsonga.besgmm.dd.npz")
    if not path.isfile(output_npz_fn):
        list_fn = path.join(
            "..", "data", "buckeye_devpart1.52e70ca864.besgmm_terms.txt")
        print("Extracting MFCCs for BES-GMM word tokens")
        utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)

    # A maximum of three pairs per class
    pairs_fn = path.join(
        "..", "data", "xitsonga.d18547ee5e.besgmm_pairs_filt7.txt")
    list_fn = path.join(
        "lists", "xitsonga.d18547ee5e.besgmm_pairs_filt7.txt")
    if not path.isfile(list_fn):
        utils.terms_from_pairs(pairs_fn, list_fn)
    else:
        print("Using existing file:", list_fn)
    input_npz_fn = path.join(mfcc_dir, "xitsonga.dd.npz")
    output_npz_fn = path.join(mfcc_dir, "xitsonga.besgmm7.dd.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting MFCCs for BES-GMM word tokens")
        utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)

    print(datetime.now())
def main(): args = check_argv() feat_type = "mfcc" # RAW FEATURES # Extract MFCCs for the different sets feat_dir = path.join(feat_type, args.language) if not path.isdir(feat_dir): os.makedirs(feat_dir) for subset in ["dev", "eval", "train"]: raw_feat_fn = path.join(feat_dir, args.language.lower() + "." + subset + ".npz") if not path.isfile(raw_feat_fn): print("Extracting MFCCs:", subset) extract_features_for_subset(args.language, subset, feat_type, raw_feat_fn) else: print("Using existing file:", raw_feat_fn) # assert False # GROUND TRUTH WORD SEGMENTS list_dir = path.join("lists", args.language) if not path.isdir(list_dir): os.makedirs(list_dir) for subset in ["dev", "eval", "train"]: # Create a ground truth word list (at least 50 frames and 5 characters) fa_fn = path.join(gp_alignments_dir, args.language, subset + ".ctm") list_fn = path.join(list_dir, subset + ".gt_words.list") if not path.isfile(list_fn): if args.language == "KO": min_frames = 26 min_chars = 3 elif args.language == "TH": min_frames = 38 min_chars = 2 elif args.language == "VN": min_frames = 30 min_chars = 4 else: min_frames = 50 min_chars = 5 utils.filter_words(fa_fn, list_fn, min_frames=min_frames, min_chars=min_chars) else: print("Using existing file:", list_fn) # Extract word segments from the MFCC NumPy archives input_npz_fn = path.join(feat_dir, args.language.lower() + "." + subset + ".npz") output_npz_fn = path.join( feat_dir, args.language.lower() + "." + subset + ".gt_words.npz") if not path.isfile(output_npz_fn): print("Extracting MFCCs for ground truth word tokens:", subset) utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn) else: print("Using existing file:", output_npz_fn) # UTD-DISCOVERED WORD SEGMENTS # Change Enno Hermann's pair file to the appropriate format enno_pairs_fn = path.join( "..", "data", args.language, # "pairs_sw_utd.train" "pairs_sw_utd_plp_vtln.train") if not path.isfile(enno_pairs_fn): # This might not be an evaluation language return pairs_fn = path.join("lists", args.language, "train.utd_pairs.list") if not path.isfile(pairs_fn): utils.format_enno_pairs(enno_pairs_fn, pairs_fn) else: print("Using existing file:", pairs_fn) list_fn = path.join("lists", args.language, "train.utd_terms.list") if not path.isfile(list_fn): print("Reading:", pairs_fn) terms = set() with codecs.open(pairs_fn, "r", "utf-8") as pairs_f: for line in pairs_f: term1, term2 = line.strip().split(" ") terms.add(term1) terms.add(term2) print("Writing:", list_fn) with codecs.open(list_fn, "w", "utf-8") as list_f: for term in sorted(terms): list_f.write(term + "\n") else: print("Using existing file:", list_fn) # Extract UTD segments input_npz_fn = path.join(feat_dir, args.language.lower() + ".train.npz") output_npz_fn = path.join(feat_dir, args.language.lower() + ".train.utd_terms.npz") if not path.isfile(output_npz_fn): print("Extracting MFCCs for UTD word tokens") utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn) else: print("Using existing file:", output_npz_fn) # UTD SEGMENTS THAT HAVE BEEN PARTIALLY FIXED # Write list with fixed labels and segments fixed_labels_list_fn = path.join("lists", args.language, "train.utd_terms.fixed_labels.list") fixed_segs_list_fn = path.join("lists", args.language, "train.utd_terms.fixed_segs.list") fixed_labels_segs_list_fn = path.join( "lists", args.language, "train.utd_terms.fixed_labels_segs.list") if (not path.isfile(fixed_labels_list_fn) or not path.isfile(fixed_labels_segs_list_fn) or not path.isfile(fixed_segs_list_fn)): # Read UTD terms utd_list_fn = 
path.join("lists", args.language, "train.utd_terms.list") print("Reading:", utd_list_fn) # overlap_dict[speaker_utt][(start, end)] is a tuple of # (label, (start, end), overlap, cluster_label) overlap_dict = {} with codecs.open(utd_list_fn, "r", "utf-8") as utd_list_f: for line in utd_list_f: term, speaker, utt, start_end = line.strip().split("_") start, end = start_end.split("-") start = int(start) end = int(end) if not speaker + "_" + utt in overlap_dict: overlap_dict[speaker + "_" + utt] = {} overlap_dict[speaker + "_" + utt][(start, end)] = ("label", (0, 0), 0, term) # Read forced alignments fa_fn = path.join(gp_alignments_dir, args.language, subset + ".ctm") print("Reading:", fa_fn) fa_dict = {} with codecs.open(fa_fn, "r", "utf-8") as fa_f: for line in fa_f: utt_key, _, start, duration, label = line.strip().split() start = float(start) duration = float(duration) end = start + duration start_frame = int(round(start * 100)) end_frame = int(round(end * 100)) if (label != "<unk>" and label != "sil" and label != "?" and label != "spn"): if not utt_key in fa_dict: fa_dict[utt_key] = {} fa_dict[utt_key][start_frame, end_frame] = label # Find ground truth terms with maximal overlap print("Getting ground truth terms with maximal overlap:") for utt_key in tqdm(fa_dict): # print(utt_key) if utt_key not in overlap_dict: continue for (fa_start, fa_end) in fa_dict[utt_key]: for (utd_start, utd_end) in overlap_dict[utt_key]: overlap = get_overlap(utd_start, utd_end, fa_start, fa_end) if overlap == 0: continue if (overlap > overlap_dict[utt_key][(utd_start, utd_end)][2]): overlap_dict[utt_key][(utd_start, utd_end)] = ( fa_dict[utt_key][(fa_start, fa_end)], (fa_start, fa_end), overlap, overlap_dict[utt_key][(utd_start, utd_end)][3]) # Write list with fixed labels if not path.isfile(fixed_labels_list_fn): print("Writing:", fixed_labels_list_fn) with codecs.open(fixed_labels_list_fn, "w", "utf-8") as list_f: for utt_key in sorted(overlap_dict): for (utd_start, utd_end) in overlap_dict[utt_key]: label = overlap_dict[utt_key][(utd_start, utd_end)][0] overlap = (overlap_dict[utt_key][(utd_start, utd_end)][2]) if overlap != 0: list_f.write("{}_{}_{:06d}-{:06d}\n".format( label, utt_key, utd_start, utd_end)) else: print("Using existing file:", fixed_labels_list_fn) # Write list with fixed labels and segment intervals if not path.isfile(fixed_labels_segs_list_fn): print("Writing:", fixed_labels_segs_list_fn) with (codecs.open(fixed_labels_segs_list_fn, "w", "utf-8")) as list_f: for utt_key in sorted(overlap_dict): for (utd_start, utd_end) in overlap_dict[utt_key]: label = overlap_dict[utt_key][(utd_start, utd_end)][0] fa_start, fa_end = (overlap_dict[utt_key][( utd_start, utd_end)][1]) overlap = (overlap_dict[utt_key][(utd_start, utd_end)][2]) if overlap != 0: list_f.write("{}_{}_{:06d}-{:06d}\n".format( label, utt_key, fa_start, fa_end)) else: print("Using existing file:", fixed_labels_segs_list_fn) # Write list with fixed segment intervals if not path.isfile(fixed_segs_list_fn): print("Writing:", fixed_segs_list_fn) with (codecs.open(fixed_segs_list_fn, "w", "utf-8")) as list_f: for utt_key in sorted(overlap_dict): for (utd_start, utd_end) in overlap_dict[utt_key]: label = overlap_dict[utt_key][(utd_start, utd_end)][3] fa_start, fa_end = (overlap_dict[utt_key][( utd_start, utd_end)][1]) overlap = (overlap_dict[utt_key][(utd_start, utd_end)][2]) if overlap != 0: list_f.write("{}_{}_{:06d}-{:06d}\n".format( label, utt_key, fa_start, fa_end)) else: print("Using existing file:", fixed_segs_list_fn) else: 
print("Using existing file:", fixed_labels_list_fn) print("Using existing file:", fixed_segs_list_fn) print("Using existing file:", fixed_labels_segs_list_fn) # Extract UTD with fixed labels input_npz_fn = path.join(feat_dir, args.language.lower() + ".train.npz") output_npz_fn = path.join( feat_dir, args.language.lower() + ".train.utd_terms.fixed_labels.npz") if not path.isfile(output_npz_fn): print("Extracting MFCCs for UTD tokens with fixed labels") utils.segments_from_npz(input_npz_fn, fixed_labels_list_fn, output_npz_fn) else: print("Using existing file:", output_npz_fn) # Extract UTD with fixed segment intervals input_npz_fn = path.join(feat_dir, args.language.lower() + ".train.npz") output_npz_fn = path.join( feat_dir, args.language.lower() + ".train.utd_terms.fixed_segs.npz") if not path.isfile(output_npz_fn): print("Extracting MFCCs for UTD tokens with fixed labels and segment " "intervals") utils.segments_from_npz(input_npz_fn, fixed_segs_list_fn, output_npz_fn) else: print("Using existing file:", output_npz_fn) # Extract UTD with fixed labels and segment intervals input_npz_fn = path.join(feat_dir, args.language.lower() + ".train.npz") output_npz_fn = path.join( feat_dir, args.language.lower() + ".train.utd_terms.fixed_labels_segs.npz") if not path.isfile(output_npz_fn): print("Extracting MFCCs for UTD tokens with fixed labels and segment " "intervals") utils.segments_from_npz(input_npz_fn, fixed_labels_segs_list_fn, output_npz_fn) else: print("Using existing file:", output_npz_fn)