def run(self):
    """
    Replace the orth of every segment in the corpus by the phoneme
    representation of its words, looked up in the given bliss lexicon,
    and dump the converted corpus.

    :raises LookupError: if a word of a segment orth is not in the lexicon
        (out-of-vocabulary), or if ``self.word_separation_orth`` itself
        has no lexicon entry
    """
    c = corpus.Corpus()
    c.load(self.bliss_corpus.get_path())
    lex = lexicon.Lexicon()
    lex.load(self.bliss_lexicon.get_path())

    # build lookup dict: orth -> first phoneme representation of its lemma
    lookup_dict = {}
    for lemma in lex.lemmata:
        for orth in lemma.orth:
            if orth and self.strategy == LexiconStrategy.PICK_FIRST:
                if len(lemma.phon) > 0:
                    lookup_dict[orth] = lemma.phon[0]

    word_separation_phon = lookup_dict[self.word_separation_orth]
    # fixed typo in the log message ("symbold" -> "symbol")
    print("using word separation symbol: %s" % word_separation_phon)
    separator = " %s " % word_separation_phon

    for segment in c.segments():
        try:
            words = [lookup_dict[w] for w in segment.orth.split(" ")]
        except LookupError as exc:
            # chain the original KeyError so the offending word stays visible
            raise LookupError(
                "Out-of-vocabulary word detected, please make sure that there are no OOVs remaining by e.g. applying G2P"
            ) from exc
        segment.orth = separator.join(words)

    c.dump(self.out_corpus.get_path())
def run(self):
    """
    Write the given static lexicon to an xml bliss lexicon file,
    optionally sorting phonemes by symbol and lemmata by first orth.
    """
    lex = lexicon.Lexicon()

    if self.sort_phonemes:
        # insert phonemes in symbol order
        for symbol in sorted(self.static_lexicon.phonemes.keys()):
            lex.add_phoneme(
                symbol=symbol, variation=self.static_lexicon.phonemes[symbol]
            )
    else:
        lex.phonemes = self.static_lexicon.phonemes

    if self.sort_lemmata:
        # sort by first orth entry
        by_first_orth = {lemma.orth[0]: lemma for lemma in self.static_lexicon.lemmata}
        lex.lemmata = [by_first_orth[orth] for orth in sorted(by_first_orth.keys())]
    else:
        lex.lemmata = self.static_lexicon.lemmata

    write_xml(self.out_bliss_lexicon.get_path(), lex.to_xml())
def run(self):
    """
    Convert a plain-text lexicon file (orth followed by backslash-separated
    phoneme variants per line) into a bliss xml lexicon.
    """
    lex = lexicon.Lexicon()

    phonemes = set()
    seen_lemma = {}

    with uopen(self.text_file.get_path()) as f:
        for line in f:
            # splitting is taken from RASR
            # src/Tools/Bliss/blissLexiconLib.py#L185
            fields = line.split(None, 1)
            orth = fields[0].split("\\", 1)[0]
            phon_variants = [
                tuple(p.split()) for p in fields[1].split("\\") if p.strip()
            ]
            for variant in phon_variants:
                phonemes.update(variant)
            phon = [" ".join(variant) for variant in phon_variants]

            if orth not in seen_lemma:
                lemma = lexicon.Lemma(orth=[orth], phon=phon)
                seen_lemma[orth] = lemma
                lex.add_lemma(lemma)
            else:
                # merge phoneme variants into the already existing lemma
                lemma = seen_lemma[orth]
                for p in phon:
                    if p not in lemma.phon:
                        lemma.phon.append(p)

    for phoneme in sorted(phonemes):
        lex.add_phoneme(phoneme)

    write_xml(self.out_bliss_lexicon.get_path(), lex.to_xml())
def run(self):
    """
    Merge lemmata that share the same first orth into a single lemma,
    combining their orth, phon, synt and eval entries, then write the
    resulting lexicon.

    Special lemmata and lemmata without orth are never merged; lemmata
    with multiple orths are only merged if ``self.merge_multi_orths_lemmata``
    is set.
    """
    lex = lexicon.Lexicon()
    lex.load(self.bliss_lexicon.get_path())

    # group mergeable lemmata by their first orth entry
    orth2lemmata = collections.defaultdict(list)

    for lemma in lex.lemmata:
        if lemma.special:
            continue
        num_orths = len(lemma.orth)
        if num_orths < 1:
            continue
        if num_orths > 1 and not self.merge_multi_orths_lemmata:
            continue
        orth2lemmata[lemma.orth[0]].append(lemma)

    # fold every group into its first lemma; renamed loop variables so the
    # inner loops no longer shadow the group key or the builtin `eval`
    for primary_orth, lemmata in orth2lemmata.items():
        if len(lemmata) < 2:
            continue
        final_lemma = lemmata[0]
        for lemma in lemmata[1:]:
            for orth in lemma.orth:
                if orth not in final_lemma.orth:
                    final_lemma.orth.append(orth)
            for phon in lemma.phon:
                if phon not in final_lemma.phon:
                    final_lemma.phon.append(phon)
            if final_lemma.synt is None and lemma.synt is not None:
                final_lemma.synt = lemma.synt
            for eval_entry in lemma.eval:
                if eval_entry not in final_lemma.eval:
                    final_lemma.eval.append(eval_entry)
            lex.lemmata.remove(lemma)

    write_xml(self.out_bliss_lexicon, element_tree=lex.to_xml())
def get_static_lexicon():
    """
    Build a lexicon containing only the special TTS symbols
    [space], [start] and [end], each added both as a lemma and
    as a phoneme with no variation.

    :return: the lexicon with special lemmas and phonemes
    :rtype: lexicon.Lexicon
    """
    lex = lexicon.Lexicon()

    lex.add_lemma(
        lexicon.Lemma(orth=["[space]"], phon=["[space]"])
    )
    lex.add_phoneme("[space]", variation="none")

    lex.add_lemma(
        lexicon.Lemma(orth=["[start]"], phon=["[start]"])
    )
    lex.add_phoneme("[start]", variation="none")

    lex.add_lemma(
        lexicon.Lemma(orth=["[end]"], phon=["[end]"])
    )
    lex.add_phoneme("[end]", variation="none")

    return lex
def run(self):
    """
    Merge several bliss lexica into one.

    Phoneme inventories are combined and must not have conflicting
    variations for the same symbol. Lemmata are either sorted by their
    first orth entry or concatenated in input order.
    """
    merged_lex = lexicon.Lexicon()

    lexica = []
    for lexicon_path in self.lexica:
        lex = lexicon.Lexicon()
        lex.load(lexicon_path.get_path())
        lexica.append(lex)

    # combine the phonemes
    merged_phonemes = OrderedDict()
    for lex in lexica:
        for symbol, variation in lex.phonemes.items():
            # the same symbol may appear in several lexica, but only
            # with an identical variation
            if symbol in merged_phonemes.keys():
                assert variation == merged_phonemes[symbol], (
                    "conflicting phoneme variant for phoneme: %s" % symbol)
            else:
                merged_phonemes[symbol] = variation

    if self.sort_phonemes:
        sorted_phoneme_list = [(k, merged_phonemes[k])
                               for k in sorted(merged_phonemes.keys())]
        for phoneme_tuple in sorted_phoneme_list:
            merged_lex.add_phoneme(symbol=phoneme_tuple[0],
                                   variation=phoneme_tuple[1])
    else:
        merged_lex.phonemes = merged_phonemes

    # combine the lemmata
    if self.sort_lemmata:
        lemma_dict = defaultdict(list)
        for lex in lexica:
            for lemma in lex.lemmata:
                # sort by first orth entry
                orth_key = lemma.orth[0] if lemma.orth else ""
                lemma_dict[orth_key].append(lemma)
        merged_lex.lemmata = list(
            itertools.chain(
                *[lemma_dict[key] for key in sorted(lemma_dict.keys())]))
    else:
        for lex in lexica:
            # NOTE(review): lemmata are concatenated as-is; no check for
            # duplicate orths across lexica is performed here
            merged_lex.lemmata.extend(lex.lemmata)

    write_xml(self.out_bliss_lexicon.get_path(), merged_lex.to_xml())
def run(self):
    """
    Build a phoneme-symbol -> index vocabulary from the lexicon's phoneme
    inventory, pickle it to ``self.out_vocab``, and publish the vocabulary
    size via ``self.out_vocab_size``.
    """
    lex = lexicon.Lexicon()
    lex.load(self.bliss_lexicon.get_path())

    # map each phoneme symbol to its position in the inventory
    vocab = {symbol: index for index, symbol in enumerate(lex.phonemes.keys())}

    # use a context manager so the file handle is flushed and closed
    # (the original left the uopen() handle dangling)
    with uopen(self.out_vocab, "wb") as f:
        pickle.dump(vocab, f)

    print("Vocab Size: %i" % len(lex.phonemes))
    self.out_vocab_size.set(len(lex.phonemes))
def _get_special_lemma_lexicon(add_unknown_phoneme_and_mapping=True):
    """
    Generate the special lemmas for LibriSpeech

    Librispeech uses silence, sentence begin/end and unknown, but no other
    special tokens.

    :param bool add_unknown_phoneme_and_mapping: add [UNKNOWN] as phoneme,
        otherwise add only the lemma without it
    :return: the lexicon with special lemmas and phonemes
    :rtype: lexicon.Lexicon
    """
    lex = lexicon.Lexicon()

    lex.add_lemma(
        lexicon.Lemma(
            orth=["[SILENCE]", ""],
            phon=["[SILENCE]"],
            synt=[],
            special="silence",
            eval=[[]],
        )
    )
    lex.add_lemma(
        lexicon.Lemma(orth=["[SENTENCE-BEGIN]"], synt=["<s>"], special="sentence-begin")
    )
    lex.add_lemma(
        lexicon.Lemma(orth=["[SENTENCE-END]"], synt=["</s>"], special="sentence-end")
    )

    # the unknown lemma optionally carries a phoneme representation
    unknown_kwargs = dict(orth=["[UNKNOWN]"], synt=["<UNK>"], special="unknown")
    if add_unknown_phoneme_and_mapping:
        unknown_kwargs["phon"] = ["[UNKNOWN]"]
    lex.add_lemma(lexicon.Lemma(**unknown_kwargs))

    lex.add_phoneme("[SILENCE]", variation="none")
    if add_unknown_phoneme_and_mapping:
        lex.add_phoneme("[UNKNOWN]", variation="none")
    return lex
def _fix_hash_for_lexicon(cls, new_lexicon):
    """
    The "old" lexicon had an incorrect "synt" type, after fixing the hashes
    for the lexicon changed, so this job here needs to revert the lexicon
    to the old "synt" type.

    Wraps each lemma's ``synt`` back into a single-element list
    (or an empty list when it is None); everything else is copied over.

    :param lexicon.Lexicon new_lexicon:
    :return: lexicon in the legacy format
    :rtype: lexicon.Lexicon
    """
    lex = lexicon.Lexicon()
    lex.phonemes = new_lexicon.phonemes
    lex.lemmata = []
    for new_lemma in new_lexicon.lemmata:
        # deepcopy so the input lexicon's lemmata are not mutated
        lemma = copy.deepcopy(new_lemma)
        lemma.synt = [new_lemma.synt] if new_lemma.synt is not None else []
        lex.lemmata.append(lemma)
    return lex
def get_static_lexicon():
    """
    Build a lexicon with the phoneme and lemma entries for special symbols
    (silence/space, sentence begin/end) and a fixed set of punctuation
    marks, each punctuation orth mapped to its own phoneme.

    :return: the lexicon with special lemmas and phonemes
    :rtype: lexicon.Lexicon
    """
    lex = lexicon.Lexicon()

    lex.add_lemma(
        lexicon.Lemma(orth=["[space]", ""], phon=["[space]"], special="silence"))
    lex.add_phoneme("[space]", variation="none")

    lex.add_lemma(
        lexicon.Lemma(orth=["[start]"], phon=["[start]"], special="sentence-begin"))
    lex.add_phoneme("[start]", variation="none")

    lex.add_lemma(
        lexicon.Lemma(orth=["[end]"], phon=["[end]"], special="sentence-end"))
    lex.add_phoneme("[end]", variation="none")

    # punctuation lemmata: orth is the literal punctuation character
    lex.add_lemma(lexicon.Lemma(orth=["."], phon=["[dot]"]))
    lex.add_phoneme("[dot]", variation="none")

    lex.add_lemma(lexicon.Lemma(orth=[","], phon=["[comma]"]))
    lex.add_phoneme("[comma]", variation="none")

    lex.add_lemma(lexicon.Lemma(orth=["?"], phon=["[question_mark]"]))
    lex.add_phoneme("[question_mark]", variation="none")

    lex.add_lemma(lexicon.Lemma(orth=["!"], phon=["[exclamation_mark]"]))
    lex.add_phoneme("[exclamation_mark]", variation="none")

    lex.add_lemma(lexicon.Lemma(orth=["-"], phon=["[hyphen]"]))
    lex.add_phoneme("[hyphen]", variation="none")

    lex.add_lemma(lexicon.Lemma(orth=['"'], phon=["[quotation]"]))
    lex.add_phoneme("[quotation]", variation="none")

    return lex
def run(self):
    """
    Build a character-level bliss lexicon from a plain word list: each
    character becomes a phoneme, and each word a lemma whose phon is the
    space-joined character sequence.
    """
    with uopen(tk.uncached_path(self.word_list_file), "rt") as f:
        words = [line.strip() for line in f]

    # collect the character inventory of all words
    chars = set()
    for word in words:
        chars.update(word)
    chars.discard(" ")  # just in case

    lex = lexicon.Lexicon()
    lex.add_phoneme("sil", variation="none")
    for ch in sorted(chars):
        lex.add_phoneme(self.transforms.get(ch, ch), "context")
    if self.add_unknown:
        lex.add_phoneme("unk", "none")
    if self.add_noise:
        lex.add_phoneme("noise", "none")

    # TODO: figure out requirements on synt/eval element for different types of lemmata
    # silence lemma, needs synt/eval element with empty token sequence
    lex.add_lemma(
        lexicon.Lemma(
            orth=["[SILENCE]", ""],
            phon=["sil"],
            synt=[],
            special="silence",
            eval=[[]],
        )
    )
    # sentence border lemmata, need no eval element
    lex.add_lemma(
        lexicon.Lemma(orth=["[SENTENCE_BEGIN]"], synt=["<s>"], special="sentence-begin")
    )
    lex.add_lemma(
        lexicon.Lemma(orth=["[SENTENCE_END]"], synt=["</s>"], special="sentence-end")
    )
    # unknown lemma, needs no synt/eval element
    if self.add_unknown:
        lex.add_lemma(
            lexicon.Lemma(orth=["[UNKNOWN]"], phon=["unk"], special="unknown")
        )
        # TODO: synt = ["<UNK>"] ???
    # noise lemma, needs empty synt token sequence but no eval element?
    if self.add_noise:
        lex.add_lemma(
            lexicon.Lemma(
                orth=["[NOISE]"],
                phon=["noise"],
                synt=[],
                special="unknown",
            )
        )

    # one lemma per word; phon is padded with surrounding spaces as before
    for word in words:
        lemma = lexicon.Lemma()
        lemma.orth.append(word)
        lemma.phon.append(
            " " + " ".join(self.transforms.get(ch, ch) for ch in word) + " "
        )
        lex.add_lemma(lemma)

    with uopen(self.out_bliss_lexicon.get_path(), "w") as lexicon_file:
        lexicon_file.write('<?xml version="1.0" encoding="utf-8"?>\n')
        lexicon_file.write(ET.tostring(lex.to_xml(), "unicode"))
def get_special_lemma_lexicon():
    """
    Generate the special phonemes/lemmas for Switchboard

    NOTE(review): both "[SENTENCE-END]" (special="sentence-boundary") and
    the lowercase "[sentence-begin]"/"[sentence-end]" variants are emitted,
    matching the original legacy behavior.

    :rtype lexicon.Lexicon
    """
    lex = lexicon.Lexicon()

    tags = ["[SILENCE]", "[NOISE]", "[VOCALIZED-NOISE]", "[LAUGHTER]"]
    tag_to_phon = {
        "[SILENCE]": "[SILENCE]",
        "[NOISE]": "[NOISE]",
        "[VOCALIZED-NOISE]": "[VOCALIZEDNOISE]",
        "[LAUGHTER]": "[LAUGHTER]",
    }
    for tag in tags:
        lex.add_phoneme(tag_to_phon[tag], variation="none")

    # non-special lemmas for every tag except silence (handled below)
    for tag in tags[1:]:
        lex.add_lemma(
            lexicon.Lemma(
                orth=[tag],
                phon=[tag_to_phon[tag]],
                synt=[],
                eval=[[]],
            )
        )

    # special lemmas
    lex.add_lemma(
        lexicon.Lemma(
            orth=["[SENTENCE-END]"], synt=["</s>"], special="sentence-boundary"
        )
    )
    lex.add_lemma(
        lexicon.Lemma(
            orth=["[sentence-begin]"],
            synt=["<s>"],
            eval=[[]],
            special="sentence-begin",
        )
    )
    lex.add_lemma(
        lexicon.Lemma(
            orth=["[sentence-end]"],
            synt=["</s>"],
            eval=[[]],
            special="sentence-end",
        )
    )
    lex.add_lemma(
        lexicon.Lemma(
            orth=["[SILENCE]", ""],
            phon=["[SILENCE]"],
            synt=[],
            eval=[[]],
            special="silence",
        )
    )
    lex.add_lemma(
        lexicon.Lemma(
            orth=["[UNKNOWN]"], synt=["<unk>"], eval=[[]], special="unknown"
        )
    )
    return lex