def gradient(theta):
    write_learned_features(theta)
    print 'getting counts...'
    exp_counts = [fst.LogWeight.ZERO] * (len(f_names) + 1)
    obs_counts = [fst.LogWeight.ZERO] * (len(f_names) + 1)
    for idx, (exp_file, obs_chain_file) in enumerate(zip(exp_machines, obs_chain)):
        sys.stdout.write('%d \r' % idx)
        sys.stdout.flush()
        exp = fst.read(path + exp_file)
        obs_c = fst.read(path + obs_chain_file)
        exp_wt = apply_weights(exp, theta)
        (e_counts, o_counts) = get_counts_for_machine(exp_wt, obs_c)
        exp_counts = accumilate_counts(e_counts, exp_counts)
        obs_counts = accumilate_counts(o_counts, obs_counts)
    grad = np.zeros(len(theta))
    for i, o in f_names:
        k = f_names[i, o]
        ok = obs_counts[k]
        ek = exp_counts[k]
        #exp(c)-exp(e)
        s1 = expm(-float(ok))
        s2 = expm(-float(ek))
        grad[k] = s1 - s2
        #print grad[k], '=', s2, '-', s1, i, o
        #pdb.set_trace()
    print '\ngrad computed'
    return grad
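# Note on gradient above (an interpretation, not part of the original comments):
# obs_counts and exp_counts hold negative-log (LogWeight) feature counts, so
# expm(-float(w)) (however expm is bound, presumably an alias for math.exp)
# recovers the real-valued count from a weight w. grad[k] is then the observed
# count minus the expected count for feature k, the usual log-linear model
# gradient, up to sign depending on whether the optimizer minimizes or maximizes.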
def value(theta):
    likelihood = 0.0
    print 'likelihoods'
    for idx, (e_file, o_chain_file) in enumerate(zip(exp_machines, obs_chain)):
        sys.stdout.write('%d \r' % idx)
        sys.stdout.flush()
        #print e_file
        e = fst.read(path + e_file)
        o_chain = fst.read(path + o_chain_file)
        likelihood += get_likelihood(e, o_chain, theta)
    reg = np.linalg.norm(theta, ord=1)
    print 'll', likelihood, 'reg', reg
    return likelihood
def get_likelihood(inp_file, E_file, o_chain_file, theta):
    inp = fst.read(path + inp_file)
    E = fst.read(path + E_file)
    o_chain = fst.read(path + o_chain_file)
    E_wt = apply_weights(E, theta)
    exp_wt = inp.compose(E_wt)  #apply_weights(exp, theta)
    #e_wt.write('e_wt.fst', e_wt.isyms, e_wt.osyms)
    exp_wt = renormalize(exp_wt)
    #e_wt.write('e_norm.fst', e_wt.isyms, e_wt.osyms)
    o = exp_wt.compose(o_chain)
    #o.write('obs.after.fst', o.isyms, o.osyms)
    ll = o.shortest_distance(True)[0]
    return float(ll)
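# Note on get_likelihood above (an interpretation, assuming the machines are in
# the log semiring as the LogWeight usage elsewhere suggests, and that state 0
# is the start state): shortest_distance(True) log-sums path weights from every
# state to the final states, so shortest_distance(True)[0] is the total weight
# of the composed machine, i.e. a negative log value of the observation chain
# under the renormalized model (smaller means more probable).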
def main(args):
    L = fst.read(args.fst_file)
    for state in L:
        for arc in state:
            arc.weight = L.SEMIRING(0.0)
    L.write(args.fst_file, keep_isyms=True, keep_osyms=True)
def main(args):
    L = fst.read(args.fst_file)
    for state in L:
        for arc in state:
            arc.weight = fst.TropicalWeight(0.0)
    L.write(args.fst_file, keep_isyms=True, keep_osyms=True)
def main(args):
    L = fst.read(args.fst_file)
    for state in L:
        for arc in state:
            if arc.weight != fst.TropicalWeight(0.0):
                sys.stderr.write(
                    "Nonzero weight in the fst: node {} arc {}".format(state, arc))
                exit(1)
def get_fst(name):
    global FSTS
    if name not in FSTS:
        here = os.path.dirname(__file__)
        fstdir = os.path.join(here, "grammar", "compiled_fsts")
        filename = os.path.join(fstdir, "%s.bin" % name)
        assert os.path.exists(filename), "FST file does not exist: " + filename
        FSTS[name] = fst.read(filename)
    return FSTS[name]
def get_fst(name):
    global FSTS
    if name not in FSTS:
        here = os.path.dirname(__file__)
        fstdir = os.path.join(here, "grammar", "compiled_fsts")
        filename = os.path.join(fstdir, "%s.bin" % name)
        assert os.path.exists(filename), "FST file does not exist: " + filename
        FSTS[name] = fst.read(filename)
        FSTS[name + "_vocab"] = set(sym for sym, num in FSTS[name].isyms.items())
    return FSTS[name]
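# A minimal usage sketch for the cached loader above; the FST name "dates" and
# the token "JANUARY" are illustrative placeholders, not names taken from the
# actual grammar/compiled_fsts directory:
#
#     dates_fst = get_fst("dates")          # read from disk once, then served from FSTS
#     if "JANUARY" in FSTS["dates_vocab"]:  # input-symbol vocabulary cached alongside it
#         pass                              # the token appears on the FST's input side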
def value(theta):
    likelihood = 0.0
    print 'likelihoods'
    for idx, obs_trellis_file in enumerate(obs_machines):
        sys.stdout.write('%d \r' % idx)
        sys.stdout.flush()
        obs_trellis = fst.read(path + obs_trellis_file)
        likelihood += get_likelihood(obs_trellis, theta)
    #reg = np.linalg.norm(theta, ord=1)
    print 'll', likelihood  #, 'reg', reg
    return likelihood
def posibilities(path, words):
    '''Return a list of possibilities for every word in fstVector.'''
    fstVector = fst.read(path)
    fstVector.remove_epsilon()
    posibilities = []
    _posibilities(fstVector, fstVector.start, words, posibilities, 0)
    # remove empty sets (filter into a new list rather than mutating the list
    # while iterating over it, which would skip elements)
    posibilities = [p for p in posibilities if len(p) > 0]
    return posibilities
def generate_suggestions(prefix):
    """
    To extract suggestions, first traverse the fst in fstfile following the
    characters of the given prefix. The state reached at the final letter of
    the prefix is saved, and the next step constructs an fst of the branch
    that grows from the saved state, using a BFS approach. All paths are then
    extracted from that acceptor in a DFS manner, with path weight calculation.
    Finally the paths are sorted by weight and the first three are written as JSON.
    INPUT: a string
    OUTPUT: a json file with up to three values for the Suggestions entry
    """
    fstfile = "/Users/dudy/CSLU/summerIntern/src/prfx_tree.fst"
    sym = fst.read_symbols("/Users/dudy/CSLU/summerIntern/src/syms")
    lm = fst.read(fstfile)
    prefix = prefix.lower()
    # look for subtree given prefix
    stateid = 0
    for ch in prefix:
        state = lm[stateid]
        for arc in state.arcs:
            if sym.find(arc.ilabel) == ch:
                print ch
                stateid = arc.nextstate
                break
    # construct desired subtree (BFS)
    reduced = bfs(stateid, lm, sym)
    # read strings (DFS)
    top3 = dfs(reduced, sym)
    # take first three (if they exist)
    suggest = []
    for (suffix, _) in top3:
        suggest.append(suffix)
    # dict it
    result = {}
    result["Suggestions:"] = suggest
    # json it
    json_file = "auto.json"
    with open(json_file, "w") as fp:
        json.dump(result, fp)
def main(args):
    L = fst.read(args.fst_file)
    for state in L:
        ilab = []
        for arc in state:
            ilab.append(arc.ilabel)
        ilabs = set(ilab)
        if 0 in ilabs and len(ilab) != 1:
            sys.stderr.write(
                "Node {} has a non-epsilon arc that is not unique: {}".format(
                    state, ilab))
            exit(1)
        if len(ilabs) != len(ilab):
            sys.stderr.write(
                "Node {} has duplicated ilabels on edges: {}".format(
                    state, ilab))
            exit(1)
def load_lat(fn):
    lat = fst.read(fn)
    lat = fst.StdVectorFst(lat)
    return lat
learned_weights = dict((int(l.split('\t')[0]), float(l.split('\t')[-1]))
                       for l in codecs.open(learned_weight_file, 'r', 'utf-8').readlines())
filenames = codecs.open(path + 'filenames', 'r', 'utf-8').readlines()[1:]
nat_sort_filenames = natural_sort(filenames)
inp_machines, obs_chain, exp_machines = zip(*[tuple(l.split()) for l in nat_sort_filenames])
obs_trelis = [o.replace('y', 'obs') for o in obs_chain]
source = [l.split() for l in codecs.open(path + 'en', 'r', 'utf-8').readlines()]
target = [l.split() for l in codecs.open(path + 'fr', 'r', 'utf-8').readlines()]
all_alignments = []
for idx, (ot, s, t) in enumerate(zip(obs_trelis, source, target)[:53]):
    print idx, s, t
    obs_t = fst.read(path + ot)
    sym_features = obs_t.isyms
    sym_targets = obs_t.osyms
    print path + ot
    obs_t.write('obs_t.fst')
    obs_wt = apply_weights(obs_t, learned_weights)
    obs_wt.write('obs_wt.fst', obs_t.isyms, obs_t.osyms)
    os.system('fstmap --map_type="to_standard" obs_wt.fst > obs_wt.std.fst')
    obs_wt_std = fst.read('obs_wt.std.fst')
    best_path = obs_wt_std.shortest_path()
    best_path.write('best_path.fst', obs_t.isyms, obs_t.osyms)
    all_alignments += do_align(idx + 1, best_path, s, t)
writer = codecs.open('never.gonna.work.20.alignments.out', 'w')
writer.write('\n'.join(all_alignments))
writer.flush()
writer.close()
def __init__(self, path):
    self.path = path
    self.fst = fst.read(self.path)
    self.isyms = dict(self.fst.isyms.items())
    c.start = c.add_state()
    space_id = syms["<space>"]
    c.add_arc(0, 0, space_id, syms["<eps>"])
    c.add_arc(0, 0, space_id, syms["+C+"])
    c.add_arc(0, 0, space_id, syms["+D+"])
    for word_id in word_ids:
        c.add_arc(0, 0, word_id, word_id)
    c[0].final = True
    return c


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("Usage: %s G.fst words.txt" % sys.argv[0], file=sys.stderr)
    g = fst.read(sys.argv[1])
    syms = {}
    syms_list = []
    for l in open(sys.argv[2]):
        ss = l.split()
        syms[ss[0]] = int(ss[1])
        syms_list.append(ss[0])
    unk_id = syms["<unk>"]
    # Following is needed to avoid line buffering
    while 1:
        l = sys.stdin.readline()
        if not l:
            break
        unks = []
        words = l.split()
    c.start = c.add_state()
    space_id = syms["<space>"]
    c.add_arc(0, 0, space_id, syms["<eps>"])
    c.add_arc(0, 0, space_id, syms["+C+"])
    c.add_arc(0, 0, space_id, syms["+D+"])
    for word_id in word_ids:
        c.add_arc(0, 0, word_id, word_id)
    c[0].final = True
    return c


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print >> sys.stderr, "Usage: %s G.fst words.txt" % sys.argv[0]
    g = fst.read(sys.argv[1])
    syms = {}
    syms_list = []
    for l in open(sys.argv[2]):
        ss = l.split()
        syms[ss[0]] = int(ss[1])
        syms_list.append(ss[0])
    unk_id = syms["<unk>"]
    # Following is needed to avoid line buffering
    while 1:
        l = sys.stdin.readline()
        if not l:
            break
        unks = []
        words = l.split()
import graphviz
import pydot
# import pywrapfst
import fst
import nltk
import re
import os

LexM = fst.read("lex_model/lex-uw.fst")
LM_expr = "^(.*)\.pru$"
folder_name = "lang_model"
file_list = [os.path.join(folder_name, fname) for fname in os.listdir(folder_name)]
pruned_models = [re.match(LM_expr, filename).group(1)
                 for filename in file_list if re.match(LM_expr, filename)]
i_table = LexM.isyms
o_table = LexM.osyms
mod_name = "lang_model/3-gram-3"
LG = fst.read(mod_name + ".pi").copy()
test_word = fst.Acceptor(syms=i_table)
test_word.add_arc(0, 1, 'HH')
test_word.add_arc(1, 2, 'EY')
test_word[2].final = True
test_comp = test_word >> LG
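# Note on the last line above: in pyfst the >> operator performs composition,
# so test_comp = test_word >> LG is equivalent to test_word.compose(LG), i.e.
# the lattice obtained by composing the "HH EY" acceptor with LG.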
__author__ = 'arenduchintala'
import sys, fst


def parseargs(args):
    try:
        in_fst_path = args[args.index('-in') + 1]
        out_fst_path = args[args.index('-out') + 1]
        n = int(args[args.index('-n') + 1])
        return [in_fst_path, out_fst_path, n]
    except (ValueError, IndexError):
        sys.stderr.write('Usage: -in [name of fst(final)] -out [name of shortest path fst] -n [number of paths]')
        exit()


if __name__ == '__main__':
    [in_fst, out_fst, n] = parseargs(sys.argv)
    sym_f = fst.read_symbols('data/symf.bin')
    sym_e = fst.read_symbols('data/syme.bin')
    f = fst.read(in_fst)
    sp = f.shortest_path(n)
    sp.remove_epsilon()
    sp.write(out_fst, sym_f, sym_e)
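# Example invocation of the script above (the script filename and the -in/-out
# paths are placeholders; data/symf.bin and data/syme.bin are hard-coded):
#
#     python shortest_path.py -in data/final.fst -out data/best.10.fst -n 10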
def load(self):
    self.fst = fst.read(self.path)
    self.isyms = dict(self.fst.isyms.items())