def lat2flat(latfile, fsgfile, lmfst):
    """
    Restrict a language model FST to the vocabulary of a lattice.

    The lattice is loaded from `latfile`.  A one-state acceptor with a
    zero-weight self-loop for every lattice word known to the input
    symbol table of `lmfst` is composed with `lmfst`; the composition is
    determinized, passed to build_fsg_fst() together with `fsgfile`,
    and returned.
    """
    dag = lattice.Dag(latfile)
    lmsyms = lmfst.InputSymbols()

    # One state (id 0) that is both the start and the final state.
    vocab = openfst.StdVectorFst()
    vocab.SetStart(vocab.AddState())
    vocab.SetFinal(0, 0)

    visited = set()
    for node in dag.nodes:
        word = node.sym
        # Fillers have been "bypassed" by PocketSphinx, and each
        # distinct node symbol only needs to be looked at once.
        if word.startswith("++") or word == "<sil>" or word in visited:
            continue
        visited.add(word)
        label = lmsyms.Find(baseword(word))
        # A label of -1 is treated as "not in the language model".
        if label != -1:
            vocab.AddArc(0, label, label, 0, 0)
    vocab.SetOutputSymbols(lmsyms)

    # Use phi matchers only when the model defines a "φ" symbol.
    phi = lmsyms.Find("φ")
    if phi == -1:
        composed = openfst.StdComposeFst(vocab, lmfst)
    else:
        opts = openfst.StdPhiComposeOptions()
        opts.matcher1 = openfst.StdPhiMatcher(vocab, openfst.MATCH_NONE)
        opts.matcher2 = openfst.StdPhiMatcher(lmfst, openfst.MATCH_INPUT, phi)
        composed = openfst.StdComposeFst(vocab, lmfst, opts)

    result = openfst.StdVectorFst()
    openfst.Determinize(composed, result)
    # Write it back out as an FSG for PocketSphinx.
    build_fsg_fst(result, fsgfile)
    return result
def testComposePhi(self):
    """
    Compose two small FSTs using phi matchers, with label 1 as phi.

    Every arc of the composition is checked: arcs with input label 2
    must have weight 0 and lead to state 1; arcs with input label 3 or
    4 must have weight 1 and lead to state 2.
    """
    a = openfst.StdVectorFst()
    a.AddState()
    a.AddState()
    a.AddArc(0, 2, 2, 0, 1)
    a.AddArc(0, 3, 3, 0, 1)
    a.AddArc(0, 4, 4, 0, 1)
    a.SetStart(0)
    a.SetFinal(0, 0)

    b = openfst.StdVectorFst()
    b.AddState()
    b.AddState()
    b.AddArc(0, 1, 1, 1, 1)
    b.AddArc(0, 2, 2, 0, 0)
    b.AddArc(1, 3, 3, 0, 1)
    b.AddArc(1, 4, 4, 0, 1)
    b.SetStart(0)
    b.SetFinal(0, 0)
    b.SetFinal(1, 0)

    opts = openfst.StdPhiComposeOptions()
    opts.matcher2 = openfst.StdPhiMatcher(b, openfst.MATCH_INPUT, 1)
    # This is necessary for reasons I do not understand
    opts.matcher1 = openfst.StdPhiMatcher(a, openfst.MATCH_NONE)
    c = openfst.StdComposeFst(a, b, opts)

    # assertEqual replaces the deprecated assertEquals alias.
    for s in c:
        for arc in c.iterarcs(s):
            if arc.ilabel == 2:
                self.assertEqual(arc.weight.Value(), 0)
                self.assertEqual(arc.nextstate, 1)
            elif arc.ilabel == 3 or arc.ilabel == 4:
                self.assertEqual(arc.weight.Value(), 1)
                self.assertEqual(arc.nextstate, 2)
def testCompose(self):
    """
    Compose two identical transducers without special matchers.

    Every arc of the composition must lead to state 1 and have output
    label 3; arcs with input label 2 must have weight 1 and arcs with
    input label 3 must have weight 2.
    """
    a = openfst.StdVectorFst()
    a.AddState()
    a.AddState()
    a.AddArc(0, 1, 2, 0, 1)
    a.AddArc(0, 2, 3, 0, 1)
    a.AddArc(0, 3, 3, 1, 1)
    a.SetStart(0)
    a.SetFinal(1, 0)

    b = openfst.StdVectorFst()
    b.AddState()
    b.AddState()
    b.AddArc(0, 1, 2, 0, 1)
    b.AddArc(0, 2, 3, 0, 1)
    b.AddArc(0, 3, 3, 1, 1)
    b.SetStart(0)
    b.SetFinal(1, 0)

    c = openfst.StdComposeFst(a, b)

    # assertEqual replaces the deprecated assertEquals alias.
    for s in c:
        for arc in c.iterarcs(s):
            self.assertEqual(arc.nextstate, 1)
            if arc.ilabel == 1:
                self.assertEqual(arc.olabel, 3)
            if arc.ilabel == 2:
                self.assertEqual(arc.olabel, 3)
                self.assertEqual(arc.weight.Value(), 1)
            if arc.ilabel == 3:
                self.assertEqual(arc.olabel, 3)
                self.assertEqual(arc.weight.Value(), 2)
def testComposeRho(self):
    """
    Compose an acceptor with a one-state FST using rho matchers, with
    label 1 as rho.

    Every arc of the composition must lead to state 1; arcs with input
    label 2 must have weight 0 and all other arcs weight 1.
    """
    a = openfst.StdVectorFst()
    a.AddState()
    a.AddState()
    a.AddArc(0, 2, 2, 0, 1)
    a.AddArc(0, 3, 3, 0, 1)
    a.AddArc(0, 4, 4, 0, 1)
    a.SetStart(0)
    a.SetFinal(1, 0)

    # Build an FST that matches 2 with no weight and everything
    # else adding weight 1
    b = openfst.StdVectorFst()
    b.AddState()
    b.AddArc(0, 1, 1, 1, 0)
    b.AddArc(0, 2, 2, 0, 0)
    b.SetStart(0)
    b.SetFinal(0, 0)

    opts = openfst.StdRhoComposeOptions()
    opts.matcher2 = openfst.StdRhoMatcher(b, openfst.MATCH_INPUT, 1)
    # This is necessary for reasons I do not understand
    opts.matcher1 = openfst.StdRhoMatcher(a, openfst.MATCH_NONE)
    c = openfst.StdComposeFst(a, b, opts)

    # assertEqual replaces the deprecated assertEquals alias.
    for s in c:
        for arc in c.iterarcs(s):
            self.assertEqual(arc.nextstate, 1)
            if arc.ilabel == 2:
                self.assertEqual(arc.weight.Value(), 0)
            else:
                self.assertEqual(arc.weight.Value(), 1)
def lat2fsg(lat, fsgfile, lmfst, prune=15):
    """
    Turn a lattice into a pruned FST scored by a language model FST.

    `lat` is either a lattice object or a file name (str); names ending
    in ".slf" are loaded through the `htkfile` keyword of lattice.Dag.
    The lattice FST is composed with `lmfst`, copied into a vector FST,
    pruned with threshold `prune`, passed to build_fsg_fst() together
    with `fsgfile`, and returned.
    """
    if not isinstance(lat, str):
        dag = lat
    elif lat.endswith(".slf"):
        dag = lattice.Dag(htkfile=lat)
    else:
        dag = lattice.Dag(lat)

    lmsyms = lmfst.InputSymbols()
    latfst = build_lattice_fsg(dag, lmsyms)

    # Compose it (intersect, really) with the language model to get
    # correct N-gram scores (otherwise it is just a unigram LM).  This
    # is the same thing as "lattice expansion".  Phi matchers are used
    # only when the model defines a "φ" symbol.
    phi = lmsyms.Find("φ")
    if phi == -1:
        composed = openfst.StdComposeFst(latfst, lmfst)
    else:
        opts = openfst.StdPhiComposeOptions()
        opts.matcher1 = openfst.StdPhiMatcher(latfst, openfst.MATCH_NONE)
        opts.matcher2 = openfst.StdPhiMatcher(lmfst, openfst.MATCH_INPUT, phi)
        composed = openfst.StdComposeFst(latfst, lmfst, opts)

    result = openfst.StdVectorFst(composed)
    openfst.Prune(result, prune)
    # Write it back out as an FSG for PocketSphinx.
    build_fsg_fst(result, fsgfile)
    return result
def fstcompile(infile):
    """
    Build an acceptor from whitespace-separated text lines.

    Each non-empty line of `infile` (any iterable of strings) is one of:

      STATE                  -- mark STATE final with weight 0
      STATE WEIGHT           -- mark STATE final with the given weight
      SRC DEST SYM [PROB]    -- arc from SRC to DEST labelled SYM on
                                both sides, weight -log(PROB); PROB
                                defaults to 1.0 and "eps" stands for "ε"

    State names are mapped to FST state ids in order of first
    appearance, and state id 0 is made the start state.  Final-state
    lines go through the same mapping as arc endpoints, so the state
    marked final is the one the arcs refer to even when the names in
    the file are not 0, 1, 2, ... in order of appearance.

    Returns the FST with a fresh symbol table installed as both its
    input and output symbols.
    """
    fst = openfst.StdVectorFst()
    symtab = openfst.SymbolTable("symbols")
    symtab.AddSymbol("ε")
    # Looking up an unseen state name allocates a new FST state for it.
    statemap = collections.defaultdict(fst.AddState)
    for line in infile:
        fields = line.strip().split()
        if len(fields) == 1:
            fst.SetFinal(statemap[fields[0]], 0)
        elif len(fields) == 2:
            # NOTE(review): the final weight is used as given, while arc
            # weights are -log(prob) -- confirm this is intended.
            fst.SetFinal(statemap[fields[0]], float(fields[1]))
        elif len(fields) > 2:
            if len(fields) > 3:
                prob = float(fields[3])
            else:
                prob = 1.0
            if fields[2] == 'eps':
                fields[2] = 'ε'
            sym = symtab.AddSymbol(fields[2])
            src = statemap[fields[0]]
            dest = statemap[fields[1]]
            fst.AddArc(src, sym, sym, -math.log(prob), dest)
    fst.SetStart(0)
    fst.SetInputSymbols(symtab)
    fst.SetOutputSymbols(symtab)
    return fst
def lat_rescore(dag, lmfst, lw=9.5):
    """
    Rescore a lattice using a language model FST.

    The lattice is converted with lat2fsg.build_lattice_fsg(), passing
    1/`lw` as its third argument, composed with `lmfst`, and the single
    shortest path is extracted.  Returns a tuple (words, score): `words`
    starts with '<s>' followed by the symbol of every arc on that path
    whose output label is non-zero, and `score` is the negated sum of
    the arc weights along the path.
    """
    lmsyms = lmfst.InputSymbols()
    latfst = lat2fsg.build_lattice_fsg(dag, lmsyms, 1. / lw)

    # Use phi matchers only when the model defines a "φ" symbol.
    phi = lmsyms.Find("φ")
    if phi == -1:
        composed = openfst.StdComposeFst(latfst, lmfst)
    else:
        opts = openfst.StdPhiComposeOptions()
        opts.matcher1 = openfst.StdPhiMatcher(latfst, openfst.MATCH_NONE)
        opts.matcher2 = openfst.StdPhiMatcher(lmfst, openfst.MATCH_INPUT, phi)
        composed = openfst.StdComposeFst(latfst, lmfst, opts)

    best = openfst.StdVectorFst()
    openfst.ShortestPath(composed, best, 1)

    # Follow the first arc out of each state until a state with no
    # arcs (or no state at all) is reached.
    hyp = ['<s>']
    total = 0
    state = best.Start()
    while state != -1 and best.NumArcs(state):
        arc = best.GetArc(state, 0)
        if arc.olabel != 0:
            hyp.append(lmsyms.Find(arc.ilabel))
        total -= arc.weight.Value()
        state = arc.nextstate
    return hyp, total
def build_classfst(probdef, isyms=None):
    """
    Build a word-to-class FST from a Sphinx probability definition.

    `probdef` is a SphinxProbdef, or anything its constructor accepts.
    `isyms`, when given (and truthy), is used as the symbol table;
    otherwise a new one containing only "ε" is created.

    The result has a single state that is both start and final.  Every
    non-epsilon symbol already in the table loops on it unchanged with
    weight 0, and every class member gets a word:class loop weighted
    -log(prob).  The transducer can be composed with the input, or
    pre-composed with the language model; in the latter case the result
    can be projected to its input to obtain an equivalent
    non-class-based model.
    """
    if not isinstance(probdef, SphinxProbdef):
        probdef = SphinxProbdef(probdef)

    symtab = isyms
    if not symtab:
        symtab = openfst.SymbolTable("words")
        symtab.AddSymbol("ε")

    fst = openfst.StdVectorFst()
    hub = fst.AddState()
    fst.SetStart(hub)
    fst.SetFinal(hub, 0)

    # Identity loops for the symbols that exist before classes are added.
    for word, label in symtab:
        if label != openfst.epsilon:
            fst.AddArc(hub, label, label, 0, hub)

    # One word:class loop per class member.
    for cname in probdef.classes:
        class_label = symtab.AddSymbol(cname)
        for member, prob in probdef.classes[cname].iteritems():
            member_label = symtab.AddSymbol(member)
            fst.AddArc(hub, member_label, class_label, -math.log(prob), hub)

    fst.SetOutputSymbols(symtab)
    fst.SetInputSymbols(symtab)
    return fst
def as_openfst(self):
    """
    Return this object's FST as an openfst.StdVectorFst.

    self.comp is saved to a temporary file, which is then read into a
    new StdVectorFst.  The file is created with tempfile.mkstemp()
    instead of a predictable "/tmp/<pid>.fst" name, and is removed even
    when saving or reading raises.
    """
    import openfst
    import os
    import tempfile

    handle, tmpfile = tempfile.mkstemp(suffix=".fst")
    # Only the name is needed; save() and Read() open the file themselves.
    os.close(handle)
    try:
        self.comp.save(tmpfile)
        fst = openfst.StdVectorFst()
        fst.Read(tmpfile)
    finally:
        os.unlink(tmpfile)
    return fst
def apply_errfst(fst, errfst):
    """
    Compose `fst` with `errfst` using sigma matchers and return the
    output projection as a StdVectorFst.

    The sigma label is whatever the input symbol table of `errfst`
    returns for "σ".
    """
    sigma_label = errfst.InputSymbols().Find("σ")

    compose_opts = openfst.StdSigmaComposeOptions()
    compose_opts.matcher1 = openfst.StdSigmaMatcher(fst, openfst.MATCH_NONE)
    compose_opts.matcher2 = openfst.StdSigmaMatcher(
        errfst, openfst.MATCH_INPUT, sigma_label, True)

    lazy = openfst.StdComposeFst(fst, errfst, compose_opts)
    # Copy into a vector FST so that it can be projected in place.
    result = openfst.StdVectorFst(lazy)
    openfst.ProjectOutput(result)
    return result
def testFinal(self):
    """Check that a final weight set on the last of four chained
    states is reported by IsFinal() and FinalWeight()."""
    fst = openfst.StdVectorFst()
    states = []
    for _ in range(4):
        states.append(fst.AddState())
    first, last = states[0], states[-1]

    fst.SetStart(first)
    fst.SetFinal(last, 73.0)
    # Link each state to its successor.
    for offset, (src, dst) in enumerate(zip(states, states[1:])):
        fst.AddArc(src, 10 + offset, 20 + offset, 90 + offset, dst)

    assert fst.IsFinal(last)
    assert abs(fst.FinalWeight(last) - 73.0) < 1e-10
def GEN(expr):
    """Create a two-state FST (a start state and a zero-weight final
    state), let `expr.generate()` fill it in between those two states,
    and return it."""
    result = openfst.StdVectorFst()
    initial = result.AddState()
    accepting = result.AddState()
    result.SetStart(initial)
    result.SetFinal(accepting, 0.0)
    expr.generate(result, initial, accepting)
    return result
def testStateIterators(self):
    """Iterating over an FST with five states must yield five distinct
    state ids, each in the range [0, 5)."""
    fst = openfst.StdVectorFst()
    for _ in range(5):
        fst.AddState()
    seen = set()
    # assertTrue/assertEqual replace the deprecated assert_/assertEquals.
    for state in fst:
        self.assertTrue(state >= 0)
        self.assertTrue(state < 5)
        seen.add(state)
    self.assertEqual(len(seen), 5)
def testCreateStd(self):
    """Create a StdVectorFst, add two states and one arc whose input
    and output labels differ, and check the state count, the arc count
    and that the kNotAcceptor property bit is set."""
    fst = openfst.StdVectorFst()
    # assertTrue/assertEqual replace the deprecated assert_/assertEquals.
    self.assertTrue(fst)
    st = fst.AddState()
    self.assertEqual(st, 0)
    self.assertEqual(fst.NumStates(), 1)
    nst = fst.AddState()
    fst.AddArc(st, 42, 69, 55, nst)
    self.assertEqual(fst.NumArcs(st), 1)
    self.assertEqual(
        fst.Properties(openfst.kFstProperties, True) & openfst.kNotAcceptor,
        openfst.kNotAcceptor)
def testArcIterators(self):
    """Add five arcs between two states and check that iterarcs()
    yields five arcs, each leading to the second state, with equal
    input and output labels and a weight in the range [0, 5)."""
    fst = openfst.StdVectorFst()
    st = fst.AddState()
    nst = fst.AddState()
    for i in range(5):
        fst.AddArc(st, i, i, i, nst)
    seen = set()
    # assertTrue/assertEqual replace the deprecated assert_/assertEquals.
    for arc in fst.iterarcs(st):
        self.assertEqual(arc.nextstate, nst)
        self.assertEqual(arc.ilabel, arc.olabel)
        self.assertTrue(arc.weight.Value() >= 0)
        self.assertTrue(arc.weight.Value() < 5)
        seen.add(arc)
    self.assertEqual(len(seen), 5)
def decompound(self, word):
    """
    Return the word sequence on the best path of the split lattice
    built for `word`.

    A Tree for `word` is filled in by self._split(), turned into an FST
    by tree.makelattice() (using self.wordcost), and the labels along
    the single shortest path are decoded by label_seq().
    """
    tree = Tree(word)
    self._split(word, tree)
    leaf_count = tree.nleafnodes()

    # Symbols are numbered by their position in sorted order.
    ordered_syms = sorted(tree.getsyms())
    sym_index = {}
    for idx, sym in enumerate(ordered_syms):
        sym_index[sym] = idx

    # States 0 .. leaf_count; state 0 is the start, the last one final.
    lattice_fst = openfst.StdVectorFst()
    for _ in range(leaf_count + 1):
        lattice_fst.AddState()
    lattice_fst.SetFinal(leaf_count, 0.0)
    lattice_fst.SetStart(0)
    tree.makelattice(lattice_fst, 0, sym_index, self.wordcost, firstword=True)

    shortest = openfst.StdVectorFst()
    openfst.ShortestPath(lattice_fst, shortest, 1)
    return label_seq(shortest, ordered_syms)
def build_dictfst(lmfst):
    """
    Build a character-to-word FST from the input symbol table of
    `lmfst`.

    Each word with a non-zero label gets its own path from the shared
    start state to the shared final state: an epsilon:word arc, then
    one letter:epsilon arc per character, then an epsilon:epsilon arc.
    '</s>' is treated as a single letter.  The input symbols of the
    result are the letters, its output symbols the word table.
    """
    letters = openfst.SymbolTable("letters")
    letters.AddSymbol("ε")
    words = lmfst.InputSymbols()

    fst = openfst.StdVectorFst()
    start = fst.AddState()
    fst.SetStart(start)
    final = fst.AddState()
    fst.SetFinal(final, 0)

    # First pass: register every letter that occurs in any word.
    for word, label in words:
        if label != 0:
            # Use a single symbol for end-of-sentence
            units = [word] if word == '</s>' else word
            for unit in units:
                letters.AddSymbol(unit)

    # Second pass: lay down one path per word.
    for word, label in words:
        if label == 0:
            continue
        label = words.Find(word)
        # Add an epsilon:word arc to the first state of this word
        tail = fst.AddState()
        fst.AddArc(start, openfst.StdArc(0, label, 0, tail))
        # Use a single symbol for end-of-sentence
        units = [word] if word == '</s>' else word
        for unit in units:
            head = fst.AddState()
            fst.AddArc(tail, openfst.StdArc(letters.Find(unit), 0, 0, head))
            tail = head
        # And an epsilon arc to the final state
        fst.AddArc(tail, openfst.StdArc(0, 0, 0, final))

    fst.SetInputSymbols(letters)
    fst.SetOutputSymbols(words)
    return fst
def lmfst_eval(lmfst, sent):
    """
    Score a sentence against a language model FST.

    `sent` is converted with sent2fst(), composed with `lmfst`, and the
    single shortest path is extracted.  Returns the negated sum of the
    arc weights along that path.
    """
    lmsyms = lmfst.InputSymbols()
    sentfst = sent2fst(sent, openfst.StdVectorFst, lmsyms)

    # Use phi matchers only when the model defines a "φ" symbol.
    phi = lmsyms.Find("φ")
    if phi == -1:
        composed = openfst.StdComposeFst(sentfst, lmfst)
    else:
        opts = openfst.StdPhiComposeOptions()
        opts.matcher1 = openfst.StdPhiMatcher(sentfst, openfst.MATCH_NONE)
        opts.matcher2 = openfst.StdPhiMatcher(lmfst, openfst.MATCH_INPUT, phi)
        composed = openfst.StdComposeFst(sentfst, lmfst, opts)

    best = openfst.StdVectorFst()
    openfst.ShortestPath(composed, best, 1)

    # Follow the first arc out of each state until a state with no
    # arcs (or no state at all) is reached.
    total = 0
    state = best.Start()
    while state != -1 and best.NumArcs(state):
        arc = best.GetArc(state, 0)
        total -= arc.weight.Value()
        state = arc.nextstate
    return total
def build_lmfst(lm, use_phi=False):
    """
    Build an FST recognizer from an N-gram backoff language model.

    Backoff arcs are labelled "φ" when `use_phi` is true and "ε"
    otherwise.  The returned FST has been passed through Connect() and
    ArcSortInput(), and uses one symbol table for input and output.

    Construction outline:

      Create a backoff state
      For M in 1 to N-1:
        For each M-gram w(1,M):
          Create a state q(1,M)
          Create an arc from state q(1,M-1) to q(1,M) with weight
            P(w(1,M))
          Create an arc from state q(1,M) to q(2,M) with weight
            bowt(w(1,M-1))
      For each N-gram w(1,N):
        Create an arc from state q(1,N-1) to q(2,N) with weight
          P(w(1,N))
    """
    symtab = openfst.SymbolTable("words")
    epsilon = symtab.AddSymbol("ε")
    bo_label = symtab.AddSymbol("φ") if use_phi else epsilon

    # Register the first word of everything lm.mgrams(0) yields.
    for unigram in lm.mgrams(0):
        symtab.AddSymbol(unigram.words[0])

    fst = openfst.StdVectorFst()
    fst.SetInputSymbols(symtab)
    fst.SetOutputSymbols(symtab)

    # Table holding M-gram to state mappings
    sidtab = {}
    fst.AddState()  # guaranteed to be zero (we hope)
    order = lm.get_size()
    for m in range(order - 1):
        add_mgram_states(fst, symtab, lm, m, sidtab, bo_label)
    add_ngram_arcs(fst, symtab, lm, order, sidtab)

    # Connect and arc-sort the resulting FST
    openfst.Connect(fst)
    openfst.ArcSortInput(fst)
    return fst
def testConvertSymbols(self):
    """
    Exercise openfst.ConvertSymbols() with two symbol tables that
    number "foo" and "bar" in opposite order.

    Checks symbol lookups in both directions, relabelling of both
    sides of an arc, relabelling of the input side only, and that
    converting an arc whose labels are in neither table raises.
    """
    syms1 = openfst.SymbolTable("syms1")
    syms1.AddSymbol("ε")
    syms1.AddSymbol("foo", 1)
    syms1.AddSymbol("bar", 2)
    syms2 = openfst.SymbolTable("syms2")
    syms2.AddSymbol("ε")
    syms2.AddSymbol("bar", 1)
    syms2.AddSymbol("foo", 2)
    # assertEqual replaces the deprecated assertEquals alias throughout.
    self.assertEqual(syms1.Find("foo"), 1)
    self.assertEqual(syms2.Find("foo"), 2)
    self.assertEqual(syms1.Find(1), "foo")
    self.assertEqual(syms2.Find(2), "foo")

    fst = openfst.StdVectorFst()
    st = fst.AddState()
    nst = fst.AddState()
    fst.AddArc(st, 1, 1, 0, nst)
    arc = fst.GetArc(st, 0)
    self.assertEqual(arc.ilabel, 1)
    self.assertEqual(arc.olabel, 1)

    fst.SetInputSymbols(syms1)
    fst.SetOutputSymbols(syms1)
    openfst.ConvertSymbols(fst, syms2, True, True)
    arc = fst.GetArc(st, 0)
    self.assertEqual(arc.ilabel, 2)
    self.assertEqual(arc.olabel, 2)

    openfst.ConvertSymbols(fst, syms1, True, False)
    arc = fst.GetArc(st, 0)
    self.assertEqual(arc.ilabel, 1)
    self.assertEqual(arc.olabel, 2)

    fst.AddArc(st, 42, 69, 0, nst)
    try:
        openfst.ConvertSymbols(fst, syms2, True, False)
    except Exception:
        # Any error is accepted; the exact exception type is not pinned.
        pass
    else:
        # TestCase has no "Fail" attribute; the method is "fail".
        self.fail("expected failure for unknown symbol")
def startFst(self):
    """Replace self.fst with a new, empty StdVectorFst."""
    fresh = openfst.StdVectorFst()
    self.fst = fresh
# Convert it to an FSM lfst = lat2fsg.build_lattice_fsg(l, rfst.OutputSymbols(), addsyms=True, determinize=False, baseword=lattice.baseword_noclass) openfst.ArcSortInput(lfst) # Apply Levenshtein model to the input errfst = LevenshteinModel(rfst.OutputSymbols()) openfst.ArcSortInput(errfst) # Apply compound word model based on the lattice compfst = CompoundWordModel(errfst.OutputSymbols(), lfst.InputSymbols()) # Precompose and project it to the lattice so compound words # are split in the alignment xlat = openfst.StdVectorFst() openfst.Compose(compfst, lfst, xlat) openfst.ProjectInput(xlat) openfst.ArcSortInput(xlat) # Compose everything together cfst = openfst.StdComposeFst(rfst, errfst) cfst = openfst.StdComposeFst(cfst, xlat) # Do bestpath search ofst = openfst.StdVectorFst() openfst.ShortestPath(cfst, ofst, 1) st = ofst.Start() err = 0 bt = [] while st != -1 and ofst.NumArcs(st): a = ofst.GetArc(st, 0) isym = ofst.InputSymbols().Find(a.ilabel)
#!/usr/bin/env python

# Copyright (c) 2010 Carnegie Mellon University
#
# You may copy and modify this freely under the same terms as
# Sphinx-III

"""
Build a class-based language model FST and write it to disk.

Usage: SCRIPT LMFILE PROBDEF

LMFILE is loaded as a sphinxbase N-gram model, combined with PROBDEF
by fstutils.build_class_lmfst(), and the result is written to
LMFILE + ".fst".
"""
__author__ = "David Huggins-Daines <*****@*****.**>"
__version__ = "$Revision $"

import sphinxbase
from . import fstutils
import openfst
import sys

if __name__ == '__main__':
    # Exactly two arguments are required; report misuse with a usage
    # line instead of an unpacking traceback.
    if len(sys.argv) != 3:
        sys.exit("Usage: %s LMFILE PROBDEF" % sys.argv[0])
    lmfile, probdef = sys.argv[1:]
    lm = sphinxbase.NGramModel(lmfile)
    lmfst = fstutils.build_class_lmfst(lm, probdef, True)
    openfst.StdVectorFst(lmfst).Write(lmfile + ".fst")
def testAddGetString(self):
    """Check that a string added with AddString() is returned by
    openfst.GetString()."""
    text = "hello"
    fst = openfst.StdVectorFst()
    fst.AddString(text)
    recovered = openfst.GetString(fst)
    assert text == recovered
def testAddString(self):
    """Add two strings to one FST; the test passes as long as neither
    call raises."""
    fst = openfst.StdVectorFst()
    for text in ("hello", "world"):
        fst.AddString(text)
def build_lattice_fsg(dag, syms=None, ascale=0, pscale=0,
                      addsyms=False, determinize=True,
                      baseword=baseword):
    """
    Build an FSM from a Sphinx word lattice.

    Parameters:
      dag         -- the lattice; its `nodes`, `start` and `end`
                     attributes are used.
      syms        -- symbol table to use.  When None, a new table
                     holding "ε", "σ", "ρ" and "φ" is created and
                     `addsyms` is forced on.
      ascale      -- if non-zero, arc weights are -x.ascr * ascale.
      pscale      -- otherwise, if non-zero, arc weights are
                     -x.post * pscale.  With both zero, weights are 0.
      addsyms     -- add the base word of every kept node to the
                     symbol table.
      determinize -- determinize the result; ignored (treated as
                     False) when `ascale` or `pscale` is non-zero.
      baseword    -- function mapping a node symbol to the word looked
                     up in the symbol table.

    Returns the FST, epsilon-removed, with the symbol table installed
    as both its input and output symbols.
    """
    fst = openfst.StdVectorFst()
    if syms is None:
        fsgsyms = openfst.SymbolTable("words")
        fsgsyms.AddSymbol("ε")
        fsgsyms.AddSymbol("σ")
        fsgsyms.AddSymbol("ρ")
        fsgsyms.AddSymbol("φ")
        addsyms = True
    else:
        fsgsyms = syms

    # One FST state per lattice node that is kept.
    statemap = {}
    for n in dag.nodes:
        # Skip fillers as they have been "bypassed" by PocketSphinx
        if n.sym.startswith("++") or n.sym == "<sil>":
            continue
        # These should not exist, but they do (!!)
        if n.sym == "<s>" and n.entry != 0:
            continue
        if n not in statemap:
            statemap[n] = fst.AddState()
        if addsyms:
            fsgsyms.AddSymbol(baseword(n.sym))
    # The start node always gets a fresh state of its own.
    statemap[dag.start] = fst.AddState()
    fst.SetStart(statemap[dag.start])

    # One arc per lattice exit between two kept nodes, labelled with
    # the symbol of the node the exit leaves.
    for n in dag.nodes:
        if n not in statemap:
            continue
        sym = fsgsyms.Find(baseword(n.sym))
        for x in n.exits:
            if x.dest not in statemap:
                continue
            weight = 0
            # Turn OOVs and non-events into epsilons
            if sym == -1 or n.sym == "<s>":
                sym = 0
            if ascale:
                weight = -x.ascr * ascale
            elif pscale:
                weight = -x.post * pscale
            fst.AddArc(statemap[x.src], sym, sym, weight, statemap[x.dest])

    # Add a </s> transition if none exists
    if not any(x.src.sym == '</s>' for x in dag.end.entries):
        end = fst.AddState()
        sym = fsgsyms.AddSymbol("</s>")
        fst.AddArc(statemap[dag.end], sym, sym, 0, end)
        fst.SetFinal(end, 0)
    else:
        fst.SetFinal(statemap[dag.end], 0)

    # Epsilon-remove it (like bypassing fillers...) (FIXME: umm...)
    openfst.RmEpsilon(fst)
    # Don't determinize if it's weighted
    if ascale or pscale:
        determinize = False
    if determinize:
        outfst = openfst.StdVectorFst()
        openfst.Determinize(fst, outfst)
        fst = outfst
    fst.SetInputSymbols(fsgsyms)
    fst.SetOutputSymbols(fsgsyms)
    return fst