import logging
from pathlib import Path

import k2
import torch

# The helpers used below but not defined in this excerpt -- Lexicon,
# get_phone_symbols, build_ctc_topo, find_first_disambig_symbol and
# compile_HLG -- are assumed to be importable from the surrounding project.


def __init__(self,
             lexicon: Lexicon,
             P: k2.Fsa,
             device: torch.device,
             oov: str = '<UNK>'):
    '''
    Args:
      lexicon: The lexicon. Its L_inv FSA has words as labels and phones
        as aux_labels.
      P: A phone bigram LM if the pronunciations in the lexicon are
        phones; a word-piece bigram if the pronunciations are word pieces.
      device: The target device that all FSAs should be moved to.
      oov: The out-of-vocabulary word.
    '''
    self.lexicon = lexicon
    L_inv = self.lexicon.L_inv.to(device)
    P = P.to(device)

    # Arc-sort L_inv if its ARC_SORTED property bit is not set; the
    # intersection below requires arc-sorted inputs.
    if L_inv.properties & k2.fsa_properties.ARC_SORTED == 0:
        L_inv = k2.arc_sort(L_inv)

    assert L_inv.requires_grad is False
    assert oov in self.lexicon.words

    self.L_inv = L_inv
    self.oov_id = self.lexicon.words[oov]
    self.oov = oov
    self.device = device

    # Build the CTC topology over all phones plus the blank symbol (id 0).
    phone_symbols = get_phone_symbols(self.lexicon.phones)
    phone_symbols_with_blank = [0] + phone_symbols
    ctc_topo = build_ctc_topo(phone_symbols_with_blank).to(device)
    assert ctc_topo.requires_grad is False

    # Compose the CTC topology with P: invert the topology so its phone
    # side becomes the labels (in-place invert_ is safe here since
    # ctc_topo is not used afterwards), add epsilon self-loops to P so
    # that treat_epsilons_specially=False is valid, intersect, and invert
    # the result back. The arc-sorted composition is cached on self.
    ctc_topo_inv = k2.arc_sort(ctc_topo.invert_())
    P_with_self_loops = k2.add_epsilon_self_loops(P)
    ctc_topo_P = k2.intersect(ctc_topo_inv,
                              P_with_self_loops,
                              treat_epsilons_specially=False).invert()
    self.ctc_topo_P = k2.arc_sort(ctc_topo_P)
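
# A minimal usage sketch for the constructor above. Everything here that
# does not appear in the excerpt is an assumption: the class name
# `MmiTrainingGraphCompiler`, the `data/lang_nosp` layout, and the
# `P.fst.txt` OpenFst text file holding the bigram.
#
#   import k2
#   import torch
#
#   device = (torch.device('cuda', 0) if torch.cuda.is_available()
#             else torch.device('cpu'))
#   lexicon = Lexicon('data/lang_nosp')  # assumed Lexicon constructor
#   with open('data/lang_nosp/P.fst.txt') as f:  # assumed bigram location
#       P = k2.Fsa.from_openfst(f.read(), acceptor=True)
#   compiler = MmiTrainingGraphCompiler(lexicon, P=P, device=device)
#   # compiler.ctc_topo_P now holds the arc-sorted composition of the CTC
#   # topology with P, ready for building training graphs.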
def __init__(self,
             L_inv: k2.Fsa,
             L_disambig: k2.Fsa,
             G: k2.Fsa,
             phones: k2.SymbolTable,
             words: k2.SymbolTable,
             device: torch.device,
             oov: str = '<UNK>'):
    '''
    Args:
      L_inv: Its labels are words, while its aux_labels are phones.
      L_disambig: L with disambiguation symbols. Its labels are phones
        and its aux_labels are words.
      G: The word-level language model.
      phones: The phone symbol table.
      words: The word symbol table.
      device: The target device that all FSAs should be moved to.
      oov: The out-of-vocabulary word.
    '''
    L_inv = L_inv.to(device)
    G = G.to(device)

    # Arc-sort L_inv and G if their ARC_SORTED property bits are not set;
    # the compositions below require arc-sorted inputs.
    if L_inv.properties & k2.fsa_properties.ARC_SORTED == 0:
        L_inv = k2.arc_sort(L_inv)
    if G.properties & k2.fsa_properties.ARC_SORTED == 0:
        G = k2.arc_sort(G)

    assert L_inv.requires_grad is False
    assert G.requires_grad is False
    assert oov in words

    L = L_inv.invert()
    L = k2.arc_sort(L)

    self.L_inv = L_inv
    self.L = L
    self.phones = phones
    self.words = words
    self.device = device
    self.oov_id = self.words[oov]

    # Build the CTC topology over all phones plus the blank symbol (id 0)
    # and keep both it and its inverse, arc-sorted.
    phone_symbols = get_phone_symbols(phones)
    phone_symbols_with_blank = [0] + phone_symbols
    ctc_topo = k2.arc_sort(
        build_ctc_topo(phone_symbols_with_blank).to(device))
    assert ctc_topo.requires_grad is False
    self.ctc_topo = ctc_topo
    self.ctc_topo_inv = k2.arc_sort(ctc_topo.invert())

    # Compile HLG once and cache it on disk; later runs load the cached
    # graph instead of recomposing it.
    lang_dir = Path('data/lang_nosp')
    if not (lang_dir / 'HLG_uni.pt').exists():
        logging.info("Composing (ctc_topo, L_disambig, G)")
        first_phone_disambig_id = find_first_disambig_symbol(phones)
        first_word_disambig_id = find_first_disambig_symbol(words)
        # decoding_graph is the result of composing (ctc_topo, L_disambig, G)
        decoding_graph = compile_HLG(
            L=L_disambig.to('cpu'),
            G=G.to('cpu'),
            H=ctc_topo.to('cpu'),
            labels_disambig_id_start=first_phone_disambig_id,
            aux_labels_disambig_id_start=first_word_disambig_id)
        torch.save(decoding_graph.as_dict(), lang_dir / 'HLG_uni.pt')
    else:
        logging.info("Loading pre-compiled HLG")
        decoding_graph = k2.Fsa.from_dict(
            torch.load(lang_dir / 'HLG_uni.pt'))

    assert hasattr(decoding_graph, 'phones')
    self.decoding_graph = decoding_graph.to(device)
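
# A similar hedged sketch for the second constructor. The class name
# `HLGGraphCompiler` and the on-disk paths/formats are assumptions; the
# `k2.Fsa.from_dict(torch.load(...))` round trip mirrors the caching
# pattern used inside the constructor itself.
#
#   import k2
#   import torch
#
#   lang = 'data/lang_nosp'
#   phones = k2.SymbolTable.from_file(f'{lang}/phones.txt')
#   words = k2.SymbolTable.from_file(f'{lang}/words.txt')
#   L_inv = k2.Fsa.from_dict(torch.load(f'{lang}/Linv.pt'))  # assumed path
#   L_disambig = k2.Fsa.from_dict(
#       torch.load(f'{lang}/L_disambig.pt'))                 # assumed path
#   G = k2.Fsa.from_dict(torch.load(f'{lang}/G.pt'))         # assumed path
#   compiler = HLGGraphCompiler(L_inv=L_inv, L_disambig=L_disambig, G=G,
#                               phones=phones, words=words,
#                               device=torch.device('cpu'))
#   # compiler.decoding_graph is HLG, compiled once and then cached in
#   # data/lang_nosp/HLG_uni.pt, moved to the requested device.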