def strset2fst(strs, fstclass=openfst.StdVectorFst):
    """
    Build a dictionary lookup FST for a set of strings.

    Every string in `strs` gets its own chain of character arcs
    leaving the shared start state, followed by one arc that reads
    nothing and emits the whole string as a single output symbol.
    The result is determinized and then epsilon-removed.

    Input symbols are characters (table "chars"), output symbols are
    the strings themselves (table "words").  `fstclass` is called with
    no arguments to create both the working and the returned FST.
    """
    paths = fstclass()
    char_syms = openfst.SymbolTable("chars")
    word_syms = openfst.SymbolTable("words")
    # Label 0 is reserved for epsilon in both tables.
    for table in (char_syms, word_syms):
        table.AddSymbol("ε")

    root = paths.AddState()
    paths.SetStart(root)
    for word in strs:
        here = root
        # Spell the word out, one input character per arc.
        for ch in word:
            there = paths.AddState()
            paths.AddArc(here, char_syms.AddSymbol(ch), 0, 0, there)
            here = there
        # Emit the word itself on a trailing epsilon-input arc.
        last = paths.AddState()
        paths.AddArc(here, 0, word_syms.AddSymbol(word), 0, last)
        paths.SetFinal(last, 0)

    result = fstclass()
    openfst.Determinize(paths, result)
    openfst.RmEpsilon(result)
    result.SetInputSymbols(char_syms)
    result.SetOutputSymbols(word_syms)
    return result
def __init__(self, isyms=None, osyms=None, ssyms=None):
    """
    Initialize the FST with symbol tables and a start state.

    Any symbol table that is not supplied is created here: "inputs"
    and "outputs" each get "ε" as their first symbol, and "states"
    gets "__START__".  The state table is kept on `self.ssyms`; the
    input and output tables are attached to the FST.  A single state
    is added and made the start state.
    """
    openfst.StdVectorFst.__init__(self)
    # Identity tests: only a missing argument triggers the defaults.
    if isyms is None:
        isyms = openfst.SymbolTable("inputs")
        isyms.AddSymbol("ε")
    if osyms is None:
        osyms = openfst.SymbolTable("outputs")
        osyms.AddSymbol("ε")
    if ssyms is None:
        ssyms = openfst.SymbolTable("states")
        ssyms.AddSymbol("__START__")
    self.ssyms = ssyms
    self.SetInputSymbols(isyms)
    self.SetOutputSymbols(osyms)
    self.SetStart(self.AddState())
def fstcompile(infile):
    """
    Compile a textual FST description into a StdVectorFst.

    `infile` is any iterable of lines.  Each line is split on
    whitespace and interpreted by its number of fields:

      1 field    STATE                    final state, weight 0
      2 fields   STATE WEIGHT             final state with that weight
      3+ fields  SRC DEST SYMBOL [PROB]   arc; PROB defaults to 1.0 and
                                          is stored as -log(PROB)

    The symbol 'eps' is rewritten to 'ε'.  The same symbol table and
    the same label are used on the input and output side of every arc.
    State names are mapped to FST states in order of first appearance,
    and state 0 (the first state mentioned) is made the start state.
    Blank lines are ignored.

    Note that a final weight is stored as given, while an arc
    probability is converted to a negative log.
    """
    fst = openfst.StdVectorFst()
    symtab = openfst.SymbolTable("symbols")
    symtab.AddSymbol("ε")
    # Looking up an unseen state name allocates a new FST state.
    statemap = collections.defaultdict(fst.AddState)
    for line in infile:
        fields = line.strip().split()
        if not fields:
            continue
        if len(fields) <= 2:
            # Go through statemap so finals refer to the same states as
            # the arcs do, whatever ids the file happens to use.
            state = statemap[fields[0]]
            if len(fields) == 2:
                fst.SetFinal(state, float(fields[1]))
            else:
                fst.SetFinal(state, 0)
            continue
        if len(fields) > 3:
            prob = float(fields[3])
        else:
            prob = 1.0
        if fields[2] == 'eps':
            fields[2] = 'ε'
        sym = symtab.AddSymbol(fields[2])
        src = statemap[fields[0]]
        dest = statemap[fields[1]]
        fst.AddArc(src, sym, sym, -math.log(prob), dest)
    # Only set a start state if the description created any state.
    if statemap:
        fst.SetStart(0)
    fst.SetInputSymbols(symtab)
    fst.SetOutputSymbols(symtab)
    return fst
def sent2fst(txt, fstclass=openfst.StdVectorFst, isyms=None, omitstart=True):
    """
    Convert a list of words, or a string of whitespace-separated
    tokens, to a sentence FST.

    The result is a linear chain with one arc per retained word,
    using the same label on the input and output side and weight 0.

    If `isyms` is given it is used as the symbol table and words it
    does not contain are skipped; otherwise a new "words" table is
    built from the sentence.  With `omitstart` true, '<s>' tokens are
    dropped.  The state reached after the last retained word is
    final, so a sentence with no retained words yields an FST whose
    start state is final.
    """
    fst = fstclass()
    start = fst.AddState()
    fst.SetStart(start)
    if isyms:
        symtab = isyms
    else:
        symtab = openfst.SymbolTable("words")
        symtab.AddSymbol("ε")
    prev = start
    if isinstance(txt, str):
        txt = txt.split()
    for c in txt:
        if omitstart and c == '<s>':
            continue
        # Resolve the label before allocating a state, so a skipped
        # word does not leave an unconnected state behind.
        if isyms:
            sym = isyms.Find(c)
            if sym == -1:
                continue
        else:
            sym = symtab.AddSymbol(c)
        nxt = fst.AddState()
        fst.AddArc(prev, sym, sym, 0, nxt)
        prev = nxt
    # `prev` is always the end of the chain, even when nothing was added.
    fst.SetFinal(prev, 0)
    fst.SetInputSymbols(symtab)
    fst.SetOutputSymbols(symtab)
    return fst
def build_classfst(probdef, isyms=None):
    """
    Build a word-to-class FST from a Sphinx probability definition.

    `probdef` is either a SphinxProbdef or something SphinxProbdef
    can be constructed from.  The transducer has a single state that
    is both start and final.  Every non-epsilon symbol already present
    in the symbol table gets an identity self-loop, and every word of
    every class gets a word:class self-loop weighted by the negative
    log of its probability.

    The transducer can be composed with the input, or pre-composed
    with the language model; in the latter case, projecting the
    result to its input gives an equivalent non-class-based model.

    If `isyms` is given it is used (and extended with class and word
    symbols); otherwise a new "words" table is created.
    """
    if not isinstance(probdef, SphinxProbdef):
        probdef = SphinxProbdef(probdef)
    classfst = openfst.StdVectorFst()
    if isyms:
        vocab = isyms
    else:
        vocab = openfst.SymbolTable("words")
        vocab.AddSymbol("ε")

    hub = classfst.AddState()
    classfst.SetStart(hub)
    classfst.SetFinal(hub, 0)

    # Pass existing (non-epsilon) symbols through unchanged.
    for _, label in vocab:
        if label != openfst.epsilon:
            classfst.AddArc(hub, label, label, 0, hub)

    # Map each class member to its class label.
    for cname in probdef.classes:
        class_label = vocab.AddSymbol(cname)
        members = probdef.classes[cname]
        for member, prob in members.iteritems():
            member_label = vocab.AddSymbol(member)
            classfst.AddArc(hub, member_label, class_label,
                            -math.log(prob), hub)

    classfst.SetOutputSymbols(vocab)
    classfst.SetInputSymbols(vocab)
    return classfst
def testConvertSymbols(self):
    """
    Check that ConvertSymbols relabels arcs between symbol tables.

    Two tables assign swapped ids to "foo" and "bar".  An arc
    labelled with id 1 on both sides is converted to the second
    table on both sides, then back to the first table on the input
    side only.  Finally, an arc with labels found in neither table
    is added and the conversion is expected to raise.
    """
    syms1 = openfst.SymbolTable("syms1")
    syms1.AddSymbol("ε")
    syms1.AddSymbol("foo", 1)
    syms1.AddSymbol("bar", 2)
    syms2 = openfst.SymbolTable("syms2")
    syms2.AddSymbol("ε")
    syms2.AddSymbol("bar", 1)
    syms2.AddSymbol("foo", 2)
    self.assertEqual(syms1.Find("foo"), 1)
    self.assertEqual(syms2.Find("foo"), 2)
    self.assertEqual(syms1.Find(1), "foo")
    self.assertEqual(syms2.Find(2), "foo")

    fst = openfst.StdVectorFst()
    st = fst.AddState()
    nst = fst.AddState()
    fst.AddArc(st, 1, 1, 0, nst)
    arc = fst.GetArc(st, 0)
    self.assertEqual(arc.ilabel, 1)
    self.assertEqual(arc.olabel, 1)
    fst.SetInputSymbols(syms1)
    fst.SetOutputSymbols(syms1)

    # Convert both sides: "foo" is id 2 in syms2.
    openfst.ConvertSymbols(fst, syms2, True, True)
    arc = fst.GetArc(st, 0)
    self.assertEqual(arc.ilabel, 2)
    self.assertEqual(arc.olabel, 2)

    # Convert only the input side back to syms1.
    openfst.ConvertSymbols(fst, syms1, True, False)
    arc = fst.GetArc(st, 0)
    self.assertEqual(arc.ilabel, 1)
    self.assertEqual(arc.olabel, 2)

    # Labels 42 and 69 are in neither table.
    fst.AddArc(st, 42, 69, 0, nst)
    try:
        openfst.ConvertSymbols(fst, syms2, True, False)
    except Exception:
        pass
    else:
        self.fail("expected failure for unknown symbol")
def build_dictfst(lmfst):
    """
    Build a character-to-word FST based on the symbol table of lmfst.

    For every non-epsilon word in the input symbol table of `lmfst`,
    the FST gets a path from the start state that first emits the
    word on an epsilon-input arc, then reads the word one character
    at a time, and finally takes an epsilon arc to the shared final
    state.  The end-of-sentence marker '</s>' is read as one symbol.

    Input symbols are a new "letters" table; output symbols are the
    input symbols of `lmfst`.
    """
    def spelling(word):
        # Use a single symbol for end-of-sentence
        if word == '</s>':
            return [word]
        return word

    letters = openfst.SymbolTable("letters")
    letters.AddSymbol("ε")
    words = lmfst.InputSymbols()
    dictfst = openfst.StdVectorFst()
    entry = dictfst.AddState()
    dictfst.SetStart(entry)
    goal = dictfst.AddState()
    dictfst.SetFinal(goal, 0)

    # First pass: register every letter in the input symbol table.
    for word, label in words:
        if label != 0:
            for ch in spelling(word):
                letters.AddSymbol(ch)

    # Second pass: lay down one path per word.
    for word, label in words:
        if label == 0:
            continue
        label = words.Find(word)
        # Add an epsilon:word arc to the first state of this word
        here = dictfst.AddState()
        dictfst.AddArc(entry, openfst.StdArc(0, label, 0, here))
        for ch in spelling(word):
            ch_label = letters.Find(ch)
            there = dictfst.AddState()
            dictfst.AddArc(here, openfst.StdArc(ch_label, 0, 0, there))
            here = there
        # And an epsilon arc to the final state
        dictfst.AddArc(here, openfst.StdArc(0, 0, 0, goal))

    dictfst.SetInputSymbols(letters)
    dictfst.SetOutputSymbols(words)
    return dictfst
def str2fst(txt, fstclass=openfst.StdVectorFst):
    """
    Convert a text string to an FST.

    The result is a linear chain with one arc per character of
    `txt`, using the same label on the input and output side and
    weight 0.  Characters are registered in a new "chars" symbol
    table.  The state after the last character is final; for an
    empty string that is the start state itself.
    """
    fst = fstclass()
    start = fst.AddState()
    fst.SetStart(start)
    symtab = openfst.SymbolTable("chars")
    symtab.AddSymbol("ε")
    prev = start
    for c in txt:
        nxt = fst.AddState()
        sym = symtab.AddSymbol(c)
        fst.AddArc(prev, sym, sym, 0, nxt)
        prev = nxt
    # `prev` is the end of the chain even when `txt` is empty.
    fst.SetFinal(prev, 0)
    fst.SetInputSymbols(symtab)
    fst.SetOutputSymbols(symtab)
    return fst
def build_lmfst(lm, use_phi=False):
    """
    Build an FST recognizer from an N-gram backoff language model.

    The symbol table "words" holds "ε", optionally "φ", and then the
    first word of every entry returned by `lm.mgrams(0)`.  Backoff
    arcs are labelled with "φ" when `use_phi` is true and with
    epsilon otherwise.  State and arc construction is delegated to
    add_mgram_states() and add_ngram_arcs(); the result is passed
    through Connect and ArcSortInput before being returned.
    """
    lmfst = openfst.StdVectorFst()
    words = openfst.SymbolTable("words")
    epsilon = words.AddSymbol("ε")
    # Label used on backoff arcs.
    backoff = words.AddSymbol("φ") if use_phi else epsilon
    for unigram in lm.mgrams(0):
        words.AddSymbol(unigram.words[0])
    lmfst.SetInputSymbols(words)
    lmfst.SetOutputSymbols(words)

    # The algorithm goes like this:
    #
    # Create a backoff state
    # For M in 1 to N-1:
    #   For each M-gram w(1,M):
    #     Create a state q(1,M)
    #     Create an arc from state q(1,M-1) to q(1,M) with weight P(w(1,M))
    #     Create an arc from state q(1,M) to q(2,M) with weight bowt(w(1,M-1))
    # For each N-gram w(1,N):
    #   Create an arc from state q(1,N-1) to q(2,N) with weight P(w(1,N))

    # Table holding M-gram to state mappings
    states = {}
    lmfst.AddState()  # guaranteed to be zero (we hope)
    for order in range(lm.get_size() - 1):
        add_mgram_states(lmfst, words, lm, order, states, backoff)
    add_ngram_arcs(lmfst, words, lm, lm.get_size(), states)

    # Connect and arc-sort the resulting FST
    openfst.Connect(lmfst)
    openfst.ArcSortInput(lmfst)
    return lmfst
# -*- coding: utf-8 -*-
from pylab import *
import openfst
from openfst import StdVectorFst as FST
from openfst import LogVectorFst as LFST

# Symbol table for character codes 0..126: code 0 is the epsilon
# symbol, codes 1..32 are written as "$" plus two hex digits, and
# everything above that is the character itself.
ASCII = openfst.SymbolTable("ASCII")
for i in range(127):
    if i > 32:
        ASCII.AddSymbol(chr(i), i)
    elif i > 0:
        ASCII.AddSymbol("$%02x" % i, i)
    else:
        ASCII.AddSymbol("ϵ", i)


def _determinize_minimize(fst, fstclass):
    """Determinize `fst` into a new `fstclass` FST and minimize it."""
    result = fstclass()
    openfst.Determinize(fst, result)
    openfst.Minimize(result)
    return result


def minimize(fst):
    """Return a determinized, minimized StdVectorFst built from `fst`."""
    return _determinize_minimize(fst, FST)


def log_minimize(fst):
    """Return a determinized, minimized LogVectorFst built from `fst`."""
    return _determinize_minimize(fst, LFST)
def build_lattice_fsg(dag, syms=None, ascale=0, pscale=0,
                      addsyms=False, determinize=True,
                      baseword=baseword):
    """
    Build an FSM from a Sphinx word lattice.

    Lattice nodes become FST states, and each exit of a node becomes
    an arc labelled (on both sides) with the symbol of the node it
    leaves.  Filler nodes ("++..." and "<sil>") and "<s>" nodes
    whose entry is not 0 are left out, along with any exit leading
    to them.  Words missing from the symbol table, and "<s>", are
    turned into epsilon arcs.

    syms        -- symbol table to use; if None, a new "words" table
                   is created and `addsyms` is forced on
    ascale      -- if non-zero, arc weight is -x.ascr * ascale
    pscale      -- otherwise, if non-zero, arc weight is
                   -x.post * pscale
    addsyms     -- add each node's base word to the symbol table
    determinize -- determinize the result; ignored (treated as
                   False) when `ascale` or `pscale` is set
    baseword    -- function applied to a node symbol before it is
                   looked up in or added to the symbol table

    The FST is epsilon-removed before the optional determinization.
    """
    fst = openfst.StdVectorFst()
    if syms is None:
        fsgsyms = openfst.SymbolTable("words")
        fsgsyms.AddSymbol("ε")
        fsgsyms.AddSymbol("σ")
        fsgsyms.AddSymbol("ρ")
        fsgsyms.AddSymbol("φ")
        addsyms = True
    else:
        fsgsyms = syms
    statemap = {}
    for n in dag.nodes:
        # Skip fillers as they have been "bypassed" by PocketSphinx
        if n.sym.startswith("++") or n.sym == "<sil>":
            continue
        # These should not exist, but they do (!!)
        if n.sym == "<s>" and n.entry != 0:
            continue
        if n not in statemap:
            statemap[n] = fst.AddState()
        if addsyms:
            fsgsyms.AddSymbol(baseword(n.sym))
    # NOTE(review): this allocates a fresh state for dag.start even
    # if the loop above already mapped it.
    statemap[dag.start] = fst.AddState()
    fst.SetStart(statemap[dag.start])
    for n in dag.nodes:
        if n not in statemap:
            continue
        sym = fsgsyms.Find(baseword(n.sym))
        for x in n.exits:
            if x.dest not in statemap:
                continue
            weight = 0
            # Turn OOVs and non-events into epsilons
            if sym == -1 or n.sym == "<s>":
                sym = 0
            if ascale:
                weight = -x.ascr * ascale
            elif pscale:
                weight = -x.post * pscale
            fst.AddArc(statemap[x.src], sym, sym, weight, statemap[x.dest])
    # Add a </s> transition if none exists
    if '</s>' not in [x.src.sym for x in dag.end.entries]:
        end = fst.AddState()
        sym = fsgsyms.AddSymbol("</s>")
        fst.AddArc(statemap[dag.end], sym, sym, 0, end)
        fst.SetFinal(end, 0)
    else:
        fst.SetFinal(statemap[dag.end], 0)
    # Epsilon-remove it (like bypassing fillers...) (FIXME: umm...)
    openfst.RmEpsilon(fst)
    # Don't determinize if it's weighted
    if ascale or pscale:
        determinize = False
    if determinize:
        outfst = openfst.StdVectorFst()
        openfst.Determinize(fst, outfst)
        fst = outfst
    fst.SetInputSymbols(fsgsyms)
    fst.SetOutputSymbols(fsgsyms)
    return fst