def run(self):
    with uopen(self.cluster_names, "rt") as f:
        cluster_names = [l.strip() for l in f]

    clusters = collections.defaultdict(set)
    with uopen(self.cluster_map, "rt") as f:
        t = ET.parse(f)
        for mi in t.findall(".//map-item"):
            k = mi.attrib["key"]
            v = int(mi.attrib["value"].split(".")[-1]) - 1
            clusters[cluster_names[v]].add(k)

    c = corpus.Corpus()
    c.load(tk.uncached_path(self.corpus_file))

    original_segments = {}
    for s in c.segments():
        original_segments[s.fullname()] = s

    audio = {}
    transcriptions = {}
    for cluster_name in clusters:
        clusters[cluster_name] = list(sorted(clusters[cluster_name]))
        transcriptions[cluster_name] = " ".join(
            original_segments[s].orth for s in clusters[cluster_name]
        )
        audio[cluster_name] = [
            (r.audio, s.start, s.end)
            for n in clusters[cluster_name]
            for s, r in [(original_segments[n], original_segments[n].recording)]
        ]

    new_c = corpus.Corpus()
    new_c.name = c.name

    for cluster_name, audio_files in audio.items():
        out_path = os.path.join(self.audio_output.get_path(), cluster_name + ".wav")
        if os.path.exists(out_path):
            os.unlink(out_path)

        # write a concat-demuxer list file for ffmpeg
        with open(f"{cluster_name}.txt", "wt") as f:
            for af in audio_files:
                f.write(f"file {af[0]}\ninpoint {af[1]}\n")
                if not math.isinf(af[2]):
                    f.write(f"outpoint {af[2]}\n")

        self.sh(
            f"ffmpeg -loglevel fatal -hide_banner -f concat -safe 0 -i '{cluster_name}.txt' '{out_path}'"
        )

        r = corpus.Recording()
        r.name = cluster_name
        r.audio = out_path

        s = corpus.Segment()
        s.name = "1"
        s.start = 0.0
        s.end = float("inf")
        s.orth = transcriptions[cluster_name]

        r.add_segment(s)
        new_c.add_recording(r)

    new_c.dump(self.output_corpus.get_path())
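# The run() above leans on ffmpeg's concat demuxer: the per-cluster list file
# contains one "file" directive per segment, with "inpoint"/"outpoint" limiting
# the slice that is concatenated. A minimal standalone sketch of that list
# format (write_concat_list and all paths/times here are hypothetical, for
# illustration only):
#
#   file /data/rec1.wav
#   inpoint 0.0
#   outpoint 2.5
#   file /data/rec2.wav
#   inpoint 1.25
#
# consumed via: ffmpeg -f concat -safe 0 -i cluster.txt cluster.wav
import math

def write_concat_list(path, pieces):
    """pieces: iterable of (audio_path, start_sec, end_sec); end may be inf."""
    with open(path, "wt") as f:
        for audio, start, end in pieces:
            f.write(f"file {audio}\ninpoint {start}\n")
            if not math.isinf(end):  # an open-ended segment gets no outpoint
                f.write(f"outpoint {end}\n")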
def run(self):
    with uopen(self.bliss_lexicon, "rt") as f:
        tree = ET.parse(f)

    with uopen(self.out_g2p_lexicon, "wt") as out:
        all_lemmas = tree.findall(".//lemma")
        assert len(all_lemmas) > 0, "No lemma tag found in the lexicon file! Wrong format file?"

        for lemma in all_lemmas:
            if lemma.get("special") is not None:
                continue
            orth = lemma.find("orth").text.strip()
            if self.include_pronunciation_variants:
                phons = lemma.findall("phon")
                phon_single = []
                for phon in phons:
                    p = phon.text.strip()
                    if p not in phon_single:
                        phon_single.append(p)
                        out.write("%s %s\n" % (orth, p))  # one line per unique variant
            else:
                phon = lemma.find("phon").text.strip()
                out.write("%s %s\n" % (orth, phon))
def run(self):
    with util.uopen(self.minuend, "rt") as fin:
        file_set1 = set(fin.read().split("\n"))
    with util.uopen(self.subtrahend, "rt") as fin:
        file_set2 = set(fin.read().split("\n"))
    with util.uopen(self.out_file, "wt") as fout:
        fout.write("\n".join(sorted(file_set1.difference(file_set2))))
def run(self):
    c = corpus.Corpus()
    c.load(self.bliss_corpus.get_path())

    dictionary = {}

    segments = None
    if self.segment_file:
        with uopen(self.segment_file) as f:
            segments = set(line.decode().strip() for line in f)

    for segment in c.segments():
        orth = segment.orth.strip()
        key = segment.fullname()
        if segments:
            if (
                not self.invert_match
                and key not in segments
                and segment.name not in segments
            ):
                continue
            if self.invert_match and key in segments:
                continue
        dictionary[key] = orth

    dictionary_string = pprint.pformat(dictionary, width=1000)
    with uopen(self.out_dictionary, "wt") as f:
        f.write(dictionary_string)
def run(self):
    with uopen(self.raw_dict_file) as read_f, uopen(self.out_dict, "w") as out_f:
        for line in read_f.readlines()[1:]:
            if line.startswith("#"):  # skip comment
                continue
            parts = line.strip().split(" ", 1)
            if len(parts) < 2:
                continue
            token = parts[0].replace("&amp;", "&")  # e.g. A&amp;E -> A&E
            mapped_token = _map_token(token)  # same preprocessing as for the corpus
            out_f.write(mapped_token + " " + parts[1] + "\n")
def run(self):
    for idx in range(self.num_ramp_ups + 1):
        if os.path.exists(self.out_g2p_models[idx].get_path()):
            continue

        args = [
            str(self.g2p_python),
            str(self.g2p_path),
            "-e", "utf-8",
            "-i", str(self.min_iter),
            "-I", str(self.max_iter),
            "-d", self.devel,
            "-s", self.size_constrains,
            "-n", "tmp-model",
            "-S",
            "-t", self.g2p_lexicon.get_path(),
        ]
        if idx > 0:
            args += ["-r", "-m", self.out_g2p_models[idx - 1].get_path()]
        args += self.extra_args

        if os.path.exists("tmp-model"):
            os.unlink("tmp-model")

        with uopen("stdout.%d" % idx, "w") as out:
            sp.check_call(args, stdout=out)

        with uopen("stdout.%d" % idx, "rt") as log:
            for line in log:
                if "total symbol errors" in line:
                    error_rate = float(line.split("(")[1].split("%")[0])
                    self.out_error_rates[idx].set(error_rate)

        os.rename("tmp-model", self.out_g2p_models[idx].get_path())

    best = min(
        ((idx, err_var.get()) for idx, err_var in enumerate(self.out_error_rates)),
        key=lambda t: t[1],
    )
    os.symlink("model-%d" % best[0], self.out_best_model.get_path())
    self.out_best_error_rate.set(best[1])
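# The error-rate parsing above assumes a Sequitur-G2P-style evaluation log,
# where the relevant line looks roughly like (value invented for illustration):
#   total symbol errors: 123 (4.56%)
# Taking the text between the first "(" and the following "%" yields the rate:
assert float("total symbol errors: 123 (4.56%)".split("(")[1].split("%")[0]) == 4.56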
def run(self):
    c = corpus.Corpus()
    c.name = self.name

    with uopen(self.metadata, "rt") as metadata_file:
        for line in metadata_file:
            name, text, processed_text = line.split("|")
            audio_file_path = os.path.join(self.audio_folder.get_path(), name + ".wav")
            assert os.path.isfile(audio_file_path), (
                "Audio file %s was not found in provided audio path %s"
                % (audio_file_path, self.audio_folder.get_path())
            )

            recording = corpus.Recording()
            recording.name = name
            recording.audio = audio_file_path

            segment = corpus.Segment()
            segment.orth = processed_text.strip()
            segment.name = name

            wave_info = wave.open(audio_file_path)
            segment.start = 0
            segment.end = wave_info.getnframes() / wave_info.getframerate()
            wave_info.close()

            recording.add_segment(segment)
            c.add_recording(recording)

    c.dump(self.out_bliss_corpus.get_path())
def run(self):
    train_binary = os.path.join(
        tk.uncached_path(self.subword_nmt_repo), "subword_nmt/learn_bpe.py"
    )
    args = [
        sys.executable,
        train_binary,
        "-o", self.out_code_file.get_path(),
        "-s", str(self.symbols),
        "--min-frequency", str(self.min_frequency),
    ]
    if self.dict_input:
        args += ["--dict-input"]
    if self.total_symbols:
        args += ["--total-symbols"]

    text_corpus = self.text_corpus.get_path()

    # stream the (possibly compressed) corpus into learn_bpe.py via stdin
    with util.uopen(text_corpus, "rb") as f:
        p = sp.Popen(args, stdin=sp.PIPE, stdout=sys.stdout, stderr=sys.stderr)
        while True:
            data = f.read(4096)
            if len(data) > 0:
                p.stdin.write(data)
            else:
                break
        p.communicate()
    assert p.returncode == 0
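# The manual 4096-byte read/write loop above streams the corpus into
# learn_bpe.py without materializing it decompressed on disk. A sketch of the
# same pattern using the standard library; stream_into and opener are
# hypothetical names, not part of the job:
import shutil
import subprocess
import sys

def stream_into(args, src_path, opener=open):
    """Feed the file at src_path to a subprocess through stdin."""
    with opener(src_path, "rb") as src:
        p = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=sys.stdout)
        shutil.copyfileobj(src, p.stdin)
        p.stdin.close()  # signal EOF so the child can finish
        p.wait()
    assert p.returncode == 0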
def run(self):
    c = corpus.Corpus()
    c.load(self.bliss_corpus.get_path())

    if self.segment_file:
        with uopen(self.segment_file, "rt") as f:
            segments_whitelist = set(
                l.strip() for l in f.readlines() if len(l.strip()) > 0
            )
    else:
        segments_whitelist = None

    with uopen(self.out_txt.get_path(), "wt") as f:
        for segment in c.segments():
            if (not segments_whitelist) or (segment.fullname() in segments_whitelist):
                f.write(segment.orth + "\n")
def run(self):
    lex = lexicon.Lexicon()

    phonemes = set()
    seen_lemma = {}
    with uopen(self.text_file.get_path()) as f:
        for line in f:
            # splitting is taken from RASR
            # src/Tools/Bliss/blissLexiconLib.py#L185
            s = line.split(None, 1)
            orth = s[0].split("\\", 1)[0]
            phon_variants = [
                tuple(p.split()) for p in s[1].split("\\") if p.strip()
            ]
            for phon_variant in phon_variants:
                phonemes.update(phon_variant)
            phon = [" ".join(v) for v in phon_variants]
            if orth in seen_lemma:
                lemma = seen_lemma[orth]
                for p in phon:
                    if p not in lemma.phon:
                        lemma.phon.append(p)
            else:
                lemma = lexicon.Lemma(orth=[orth], phon=phon)
                seen_lemma[orth] = lemma
                lex.add_lemma(lemma)

    for phoneme in sorted(phonemes):
        lex.add_phoneme(phoneme)

    write_xml(self.out_bliss_lexicon.get_path(), lex.to_xml())
def _read_ngrams(self, n):
    """
    Read the n-grams between the known start and end lines
    """
    with util.uopen(self.lm_path, "rt", encoding="utf-8") as infile:
        infile.seek(self.ngrams_start[n - 1][1])
        i = self.ngrams_start[n - 1][0] - 1
        while i < self.ngrams_end[n - 1]:
            i += 1
            text = infile.readline()
            entry = text.split()
            prob = float(entry[0])
            if len(entry) > n + 1:
                back = float(entry[-1])
                words = entry[1 : n + 1]
            else:
                back = 0.0
                words = entry[1:]
            ngram = " ".join(words)
            if (n == 1) and words[0] == "<s>":
                self.sentprob = prob
                prob = 0.0
            if (i - (self.ngrams_start[n - 1][0] - 1)) % 1000 == 0:
                logging.info(f"Read 1000 {n}-grams")
            yield ngram, (prob, back)
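# _read_ngrams assumes the standard ARPA layout: inside a "\N-grams:" section,
# each line is "logprob w1 ... wN [backoff]", with the backoff weight present
# only when the model stores one for that n-gram. A tiny line parser mirroring
# the logic above (parse_arpa_line is a hypothetical helper; the sample line
# is made up):
def parse_arpa_line(line, n):
    entry = line.split()
    prob = float(entry[0])
    if len(entry) > n + 1:  # the trailing field is the backoff weight
        back = float(entry[-1])
        words = entry[1 : n + 1]
    else:
        back = 0.0
        words = entry[1:]
    return " ".join(words), (prob, back)

assert parse_arpa_line("-0.2553 in the -0.53", 2) == ("in the", (-0.2553, -0.53))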
def run(self):
    segment_dict = {}
    for task_id, log_file in self.alignment_logs.items():
        logging.info("Reading: {}".format(log_file))
        file_path = tk.uncached_path(log_file)
        document = ET.parse(uopen(file_path))
        _seg_list = document.findall(".//segment")
        for seg in _seg_list:
            avg = seg.find(".//score/avg")
            segment_dict[seg.attrib["full-name"]] = float(avg.text)
        del document

    logging.info("Scores have {} entries.".format(len(segment_dict)))
    score_np = np.asarray(list(segment_dict.values()))
    logging.info(
        "Max {}; Min {}; Median {}".format(
            score_np.max(), score_np.min(), np.median(score_np)
        )
    )

    avg_score_threshold = np.percentile(score_np, self.percentile)
    if np.isnan(avg_score_threshold):
        avg_score_threshold = np.inf
    logging.info(
        "Avg threshold is {} with percentile {}".format(
            avg_score_threshold, self.percentile
        )
    )
    if self.absolute_threshold is not None:
        avg_score_threshold = min(avg_score_threshold, self.absolute_threshold)
    logging.info("Threshold is {}".format(avg_score_threshold))

    if self.plot:
        import matplotlib

        matplotlib.use("Agg")
        import matplotlib.pyplot as plt

        plot_percentile = np.percentile(score_np, 90)  # there can be huge outliers
        np.clip(score_np, 0, 200, out=score_np)
        plt.hist(score_np, bins=100, range=(0, 200))
        plt.xlabel("Average Maximum-Likelihood Score")
        plt.ylabel("Number of Segments")
        plt.title("Histogram of Alignment Scores")
        plt.savefig(fname=self.out_plot_avg.get_path())

    # only keep segments whose score is below the threshold
    filtered_segments = [
        seg for seg, avg in segment_dict.items() if avg <= avg_score_threshold
    ]
    logging.info("Have {} entries after filtering.".format(len(filtered_segments)))

    for idx, segments in enumerate(chunks(filtered_segments, self.num_segments)):
        with open(self.out_single_segment_files[idx + 1].get_path(), "wt") as segment_file:
            for segment in segments:
                segment_file.write(segment + "\n")

    with open(self.out_single_file.get_path(), "wt") as segment_file:
        for segment in filtered_segments:
            segment_file.write(segment + "\n")
def run(self):
    lex = lexicon.Lexicon()
    lex.load(self.bliss_lexicon.get_path())

    vocab = {k: v for v, k in enumerate(lex.phonemes.keys())}
    with uopen(self.out_vocab, "wb") as f:
        pickle.dump(vocab, f)

    print("Vocab size: %i" % len(lex.phonemes))
    self.out_vocab_size.set(len(lex.phonemes))
def run(self):
    bpe_codes_cmd = [
        sys.executable,
        os.path.join(tk.uncached_path(self.subword_nmt_repo), "learn_bpe.py"),
        "--output", self.out_bpe_codes.get_path(),
        "--symbols", str(self.bpe_size),
    ]
    util.create_executable("create_bpe_codes.sh", bpe_codes_cmd)

    with util.uopen(self.text_file, "rb") as f:
        p = sp.Popen(
            bpe_codes_cmd, stdin=sp.PIPE, stdout=sys.stdout, stderr=sys.stderr
        )
        while True:
            data = f.read(4096)
            if len(data) > 0:
                p.stdin.write(data)
            else:
                break
        p.communicate()
    assert p.returncode == 0

    bpe_vocab_cmd = [
        sys.executable,
        os.path.join(tk.uncached_path(self.subword_nmt_repo), "create-py-vocab.py"),
        "--txt", self.text_file.get_path(),
        "--bpe", self.out_bpe_codes.get_path(),
        "--unk", self.unk_label,
        "--out", self.out_bpe_vocab.get_path(),
    ]
    util.create_executable("create_bpe_vocab.sh", bpe_vocab_cmd)
    sp.run(bpe_vocab_cmd, check=True)

    with util.uopen(self.out_bpe_vocab) as f:
        num_labels = max(list(eval(f.read()).values())) + 1  # indices are 0-based
    self.out_vocab_size.set(num_labels)
def run(self):
    d = eval(util.uopen(self.search_py_output, "r").read())
    assert isinstance(d, dict)  # seq_tag -> bpe string
    assert not os.path.exists(self.out_word_search_results.get_path())

    with util.uopen(self.out_word_search_results, "w") as out:
        out.write("{\n")
        for seq_tag, entry in sorted(d.items()):
            if "#" in seq_tag:
                tag_split = seq_tag.split("/")
                recording_name, segment_name = tag_split[2].split("#")
                seq_tag = tag_split[0] + "/" + recording_name + "/" + segment_name
            if isinstance(entry, list):
                # n-best list as [(score, text), ...]
                out.write("%r: [\n" % (seq_tag))
                for score, text in entry:
                    out.write("(%f, %r),\n" % (score, text.replace("@@ ", "")))
                out.write("],\n")
            else:
                out.write("%r: %r,\n" % (seq_tag, entry.replace("@@ ", "")))
        out.write("}\n")
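# subword-nmt marks BPE boundaries with a trailing "@@" on every non-final
# subword, so deleting "@@ " restores the original words, exactly as done in
# run() above (the sample string is illustrative):
assert "trans@@ cript@@ ion of speech".replace("@@ ", "") == "transcription of speech"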
def filter(self):
    handle, tmp_path = mkstemp(dir=".", text=True)
    # keep only complete entries (four tab-separated fields)
    with uopen(self.out_g2p_lexicon, "rt") as lex, os.fdopen(handle, "wt") as fd_out:
        for line in lex:
            if len(line.strip().split("\t")) == 4:
                fd_out.write(line)
    os.remove(self.out_g2p_lexicon)
    os.rename(tmp_path, self.out_g2p_lexicon)
def run(self):
    corpus = Corpus()
    corpus.load(self.bliss_corpus.get_path())

    d = eval(util.uopen(self.recog_words_file.get_path(), "r").read())
    assert isinstance(d, dict), "only search output files in dict format are supported"

    with util.uopen(self.out_ctm_file.get_path(), "w") as out:
        out.write(";; <name> <track> <start> <duration> <word> <confidence> [<n-best>]\n")
        for seg in corpus.segments():
            seg_start = 0.0 if seg.start == float("inf") else seg.start
            seg_end = 0.0 if seg.end == float("inf") else seg.end
            seg_fullname = seg.fullname()
            assert seg_fullname in d, "cannot find {} in search output".format(seg_fullname)
            out.write(";; %s (%f-%f)\n" % (seg_fullname, seg_start, seg_end))
            assert isinstance(d[seg_fullname], str), "no support for n-best lists yet"
            words = d[seg_fullname].split()
            # linearly interpolate the start/duration of each word, as no time stamps are given
            avg_dur = (seg_end - seg_start) * 0.9 / max(len(words), 1)
            for i in range(len(words)):
                if (
                    self.filter_tags
                    and words[i].startswith("[")
                    and words[i].endswith("]")
                ):
                    continue
                out.write(
                    "%s 1 %f %f %s 0.99\n"
                    % (seg.recording.name, seg_start + avg_dur * i, avg_dur, words[i])
                )
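# Since the search output carries no per-word time stamps, run() spreads the
# words evenly over 90% of the segment. The interval computation in isolation
# (word_intervals is a hypothetical helper, not part of the job):
def word_intervals(seg_start, seg_end, num_words):
    """Return (start, duration) per word, evenly spaced over 90% of the segment."""
    avg_dur = (seg_end - seg_start) * 0.9 / max(num_words, 1)
    return [(seg_start + avg_dur * i, avg_dur) for i in range(num_words)]

assert word_intervals(0.0, 10.0, 3) == [(0.0, 3.0), (3.0, 3.0), (6.0, 3.0)]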
def run(self):
    speaker_id, gender, rec_name = None, None, None
    with uopen(self.speakers_stats_file) as read_f, uopen(self.out_speakers_list, "w") as out_f:
        for line in read_f:
            l = line.strip().split()
            if len(l) < 2:
                continue
            if l[1] == "F" or l[1] == "M":
                # start of a new speaker
                speaker_id = l[0]
                gender = l[1]
                rec_name = l[2]
            elif l[0].endswith("A") or l[0].endswith("B"):
                # recording name
                rec_name = l[0]
            else:
                continue
            if speaker_id:
                # output format: speaker_id gender recording
                out_f.write(speaker_id + " " + gender + " " + rec_name + "\n")
def run(self):
    with uopen(self.out_g2p_lexicon, "wt") as out, uopen(self.out_g2p_untranslated, "wt") as err:
        sp.check_call(
            [
                str(self.g2p_python),
                str(self.g2p_path),
                "-e", "utf-8",
                "-V", str(self.variants_mass),
                "--variants-number", str(self.variants_number),
                "-m", self.g2p_model.get_path(),
                "-a", self.word_list.get_path(),
            ],
            stdout=out,
            stderr=err,
        )
def run(self):
    c = corpus.Corpus()
    c.load(self.bliss_corpus.get_path())

    if self.segment_file:
        with uopen(self.segment_file.get_path(), "rt") as f:
            segments_whitelist = set(
                l.strip() for l in f.readlines() if len(l.strip()) > 0
            )
        segment_iterator = filter(
            lambda s: s.fullname() in segments_whitelist, c.segments()
        )
    else:
        segment_iterator = c.segments()

    with uopen(self.text_file, "rt") as f:
        for segment, line in itertools.zip_longest(segment_iterator, f):
            assert segment is not None, "there were more text file lines than segments"
            assert line is not None, "there were fewer text file lines than segments"
            assert len(line) > 0
            segment.orth = line.strip()

    c.dump(self.out_corpus.get_path())
def run(self):
    # keep the case only for case-sensitive matching, otherwise compare lowercased
    transform = (lambda s: s) if self.case_sensitive else (lambda s: s.lower())

    with uopen(tk.uncached_path(self.bliss_lexicon), "r") as lexicon_file:
        old_lexicon = ET.fromstring(lexicon_file.read())

    with uopen(tk.uncached_path(self.word_list), "r") as words_file:
        words = set([transform(w.strip()) for w in words_file.readlines()])

    root = ET.Element("lexicon")
    root.append(old_lexicon.find("phoneme-inventory"))
    for lemma in old_lexicon.findall("lemma"):
        if any(
            transform(orth.text) in words
            or (orth.text is not None and orth.text.startswith("["))
            for orth in lemma.findall("orth")
        ):
            root.append(lemma)

    with uopen(self.out_bliss_lexicon.get_path(), "wt") as lexicon_file:
        lexicon_file.write('<?xml version="1.0" encoding="utf-8"?>\n')
        lexicon_file.write(ET.tostring(root, "unicode"))
def run(self):
    tag_map = {}

    c = corpus.Corpus()
    c.load(self.bliss_corpus.get_path())

    all_tags = [
        ("d%d" % i, "default%d" % i, "all other segments of category %d" % i)
        for i in range(len(self.tag_mapping) + 1)
    ]

    for segment in c.segments():
        tag_map[segment.fullname()] = [
            "d%d" % i for i in range(len(self.tag_mapping) + 1)
        ]

    for i, (tag, segments) in enumerate(self.tag_mapping):
        all_tags.append(tag)
        for file in segments.values():
            for segment in uopen(file):
                if segment.rstrip() in tag_map:
                    tag_map[segment.rstrip()][i] = tag[0]

    with uopen(self.out_stm_path, "wt") as out:
        for segment in c.segments():
            speaker_name = (
                segment.speaker().name
                if segment.speaker() is not None
                else segment.recording.name
            )
            segment_track = segment.track + 1 if segment.track else 1
            out.write(
                "%s %d %s %5.2f %5.2f <%s> %s\n"
                % (
                    segment.recording.name,
                    segment_track,
                    speaker_name,
                    segment.start,
                    segment.end,
                    ",".join(tag_map[segment.fullname()]),
                    segment.orth,
                )
            )
        for tag in all_tags:
            out.write(';; LABEL "%s" "%s" "%s"\n' % tag)
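# Each STM row written above follows the layout
# "<recording> <channel> <speaker> <start> <end> <tag,...> <orth>", and the
# trailing ";; LABEL" lines declare every tag used in the sixth column. One
# illustrative row (all values invented):
row = "%s %d %s %5.2f %5.2f <%s> %s" % ("rec1", 1, "spk1", 0.0, 2.5, "d0,d1", "hello world")
assert row == "rec1 1 spk1  0.00  2.50 <d0,d1> hello world"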
def run(self):
    with uopen(self.g2p_lexicon, "rt", encoding="utf-8") as f:
        oov_words = dict()
        for orth, data in it.groupby(
            map(lambda line: line.strip().split("\t"), f), lambda t: t[0]
        ):
            oov_words[orth] = []
            for d in data:
                if len(d) == 4:
                    oov_words[orth].append(d[3])
                elif len(d) < 4:
                    logging.warning(
                        'No pronunciation found for orthography "{}"'.format(orth)
                    )
                else:
                    logging.warning(
                        'Did not fully parse entry for orthography "{}"'.format(orth)
                    )

    with uopen(self.iv_bliss_lexicon, "rt") as f:
        iv_lexicon = ET.parse(f)

    if self.merge:
        root = iv_lexicon.getroot()
    else:
        root = ET.Element("lexicon")
        root.append(iv_lexicon.find("phoneme-inventory"))

    for orth, prons in oov_words.items():
        lemma = ET.SubElement(root, "lemma")
        ET.SubElement(lemma, "orth").text = orth
        for pron in prons:
            ET.SubElement(lemma, "phon").text = pron

    with uopen(self.out_oov_lexicon, "wt", encoding="utf-8") as f:
        tree = ET.ElementTree(root)
        tree.write(f, "unicode", True)
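# The g2p apply output consumed above is tab-separated with four fields per
# pronunciation variant: "word<TAB>variant<TAB>score<TAB>phonemes". Grouping
# by the first column collects all variants of one word (scores and phoneme
# strings below are invented for illustration):
import itertools as it

_lines = ["hello\t0\t0.92\tHH AH L OW", "hello\t1\t0.05\tHH EH L OW"]
_grouped = {
    orth: [d[3] for d in data if len(d) == 4]
    for orth, data in it.groupby((l.split("\t") for l in _lines), lambda t: t[0])
}
assert _grouped == {"hello": ["HH AH L OW", "HH EH L OW"]}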
def run(self):
    with uopen(tk.uncached_path(self.bliss_lexicon), "r") as lexicon_file:
        lexicon = ET.fromstring(lexicon_file.read())

    words = set()
    for e in lexicon.findall("./lemma/orth"):
        if (
            e.text is not None
            and len(e.text) > 0
            and not (e.text.startswith("[") and self.apply_filter)
        ):
            words.add(e.text)

    with open(self.out_word_list.get_path(), "w") as word_file:
        for w in sorted(words):
            word_file.write("%s\n" % w)
def run(self):
    c = libcorpus.Corpus()
    c.load(self.bliss_corpus.get_path())

    words = Counter()
    for s in c.segments():
        words.update(s.orth.strip().split())

    counts = [(v, k) for k, v in words.items()]
    with uopen(self.out_word_counts, "wt") as f:
        f.write(
            "\n".join(
                "%d\t%s" % t for t in sorted(counts, key=lambda t: (-t[0], t[1]))
            )
        )
def run(self):
    if self.casing != "none":
        logging.warning(
            "The orthography/lemma casing is changed. Is this what you want? "
            "Normally this should be set to 'none'; it is 'upper' only for legacy reasons."
        )

    def change_casing(text_str):
        if self.casing == "upper":
            return text_str.upper()
        elif self.casing == "lower":
            return text_str.lower()
        elif self.casing == "none":
            return text_str
        else:
            raise NotImplementedError

    with uopen(self.bliss_lexicon, "rt", encoding="utf-8") as f:
        tree = ET.parse(f)
        iv_words = {
            change_casing(orth.text)
            for orth in tree.findall(".//lemma/orth")
            if orth.text
        }

    with uopen(self.bliss_corpus, "rt", encoding="utf-8") as f:
        tree = ET.parse(f)
        oov_words = {
            w
            for orth_tag in tree.findall(".//recording/segment/orth")
            for w in orth_tag.text.strip().split()
            if change_casing(w) not in iv_words
        }

    with uopen(self.out_oov_words, "wt") as f:
        for w in sorted(oov_words):
            f.write("%s\n" % w)
def run(self):
    state_tying_dict = {}
    with uopen(self.state_tying_file, "r") as f:
        for line in f:
            if not line.strip():
                continue
            tokens = line.split()
            assert len(tokens) >= 2, line
            state_tying_dict[tokens[0]] = " ".join(tokens[1:])

    vocab_dict = {}
    max_id = 0
    for allo, index in state_tying_dict.items():
        phon = allo.split("{")[0]
        if not phon.startswith("[") and not phon.endswith("]"):
            # mark word boundaries; a single-phoneme pronunciation carries both @i and @f
            if self.add_eow and "@f" in allo:
                phon += "#"
            elif self.add_sow and "@i" in allo:
                phon = "#" + phon
        if phon in vocab_dict:
            assert vocab_dict[phon] == index, (
                "index conflict for %s: %s vs. %s (allophone %s)"
                % (phon, index, vocab_dict[phon], allo)
            )
        else:
            vocab_dict[phon] = index
        if int(index) > max_id:
            max_id = int(index)

    with uopen(self.out_label_file, "w") as f:
        for v in sorted(vocab_dict.keys()):
            f.write(v + " " + vocab_dict[v] + "\n")

    print("number of classes:", len(set(vocab_dict.values())))
    assert len(set(vocab_dict.values())) == max_id + 1, (
        "expected number of classes %d" % (max_id + 1)
    )
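# The state-tying file parsed above maps RASR allophone states to class
# indices, one "<allophone-state> <index>" pair per line. The exact layout can
# vary, but lines look roughly like (indices invented):
#
#   AH{HH+L}@i.0 17
#   AH{HH+L}@i@f.0 17
#   [SILENCE]{#+#}@i@f.0 0
#
# Splitting on "{" recovers the center phoneme; "@i"/"@f" mark word-initial
# and word-final states, which drive the optional sow/eow "#" markers:
assert "AH{HH+L}@i@f.0".split("{")[0] == "AH"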
def _get_speakers(self):
    """
    Extract the speakers from the SPEAKERS.TXT file
    """
    with uopen(self.speaker_metadata, "r") as speakersfile:
        for line in speakersfile:
            if line[0] == ";":  # skip comment lines
                continue
            procline = list(map(str.strip, line.split("|")))
            self._speakers[int(procline[0])] = [
                procline[1],
                procline[2],
                float(procline[3]),
                procline[4],
            ]
def prior_and_plot(self):
    all_scores = []
    all_frames = []
    max_emission = 0
    for l in self.out_log_file.values():
        with util.uopen(l.get_path(), "rb") as f:
            tree = ET.parse(f)
        scores = {}
        activations = tree.find(".//activations")
        all_frames.append(int(activations.find("num_frames").text.strip()))
        for score_elem in activations.findall("score"):
            emission = int(score_elem.attrib["emission"])
            max_emission = max(max_emission, emission)
            score = float(score_elem.text.strip())
            scores[emission] = score
        all_scores.append(scores)

    # frame-weighted average of the per-shard scores
    total_frames = sum(all_frames)
    merged_scores = [0.0 for _ in range(max_emission + 1)]
    for frames, scores in zip(all_frames, all_scores):
        scale = frames / total_frames
        for k, v in scores.items():
            merged_scores[k] += scale * v

    # renormalize and store the log-prior as a vector-f32 XML file
    total_mass = sum(merged_scores)
    merged_scores = [s / total_mass for s in merged_scores]
    with open(self.out_prior.get_path(), "wt") as f:
        f.write(
            '<?xml version="1.0" encoding="UTF-8"?>\n<vector-f32 size="%d">\n'
            % len(merged_scores)
        )
        f.write(" ".join("%.20e" % math.log(s) for s in merged_scores) + "\n")
        f.write("</vector-f32>")

    if self.plot_prior:
        import matplotlib

        matplotlib.use("Agg")
        import matplotlib.pyplot as plt

        xdata = range(len(merged_scores))
        plt.semilogy(xdata, merged_scores)
        plt.xlabel("emission idx")
        plt.ylabel("prior")
        plt.grid(True)
        plt.savefig(self.out_prior_plot.get_path())
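# The prior above is a frame-weighted average of the per-shard emission
# scores, renormalized to a probability distribution before taking the log:
#   merged[k] = sum_j (frames_j / total_frames) * scores_j[k]
#   prior[k]  = merged[k] / sum(merged)
# A compact sketch with made-up shard data (frames, {emission: score}):
_shards = [(100, {0: 0.7, 1: 0.3}), (300, {0: 0.5, 1: 0.5})]
_total = sum(frames for frames, _ in _shards)
_merged = [0.0, 0.0]
for _frames, _scores in _shards:
    for _k, _v in _scores.items():
        _merged[_k] += (_frames / _total) * _v
_mass = sum(_merged)
_prior = [v / _mass for v in _merged]
assert abs(sum(_prior) - 1.0) < 1e-9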
def _get_rec_to_segs_map(self):
    """
    Returns a recording to list-of-segments mapping
    """
    rec_to_segs = defaultdict(list)
    for trans_file in glob.glob(
        os.path.join(self.trans_dir.get_path(), "*/*/*-trans.text")
    ):
        with uopen(trans_file, "rt") as f:
            for line in f:
                seg_info = line.strip().split(" ", 3)  # name start end orth
                assert len(seg_info) == 4
                # e.g. sw2001A-ms98-a-0022 -> sw02001A
                rec_name = seg_info[0].split("-")[0].replace("sw", "sw0")
                rec_to_segs[rec_name].append(seg_info)
    return rec_to_segs