def parse_cube(lang_name, finite):
    """
    Reads hypotheses of lang_name, estimates p/r and the posterior score, and saves them into a cube (list of tables).

    Data structure:
        stats = [cube, topn]
        cube = [[size, Z, table], [size, Z, table], ...]
        table = [[ind, score, p, r, f, strs], [ind, score, p, r, f, strs], ...]

    NOTE: topn here is a dict; you can use ind to find the h.

    Example script:
        mpiexec -n 12 python parse_hypothesis.py --mode=parse_cube --language=An --finite=3/10
    """
    _dir = 'out/'
    global size
    global rank
    topn = dict()
    prf_dict = {}
    language = instance(lang_name, finite)

    if rank == 0:
        truncate_flag = not (lang_name == 'An' and finite <= 3)
        set_topn = set()

        print 'loading..'; fff()
        for file_name in listdir(_dir):
            if lang_name + '_' in file_name:
                _set = load(open(_dir + file_name))
                set_topn.update([h for h in _set])

        print 'getting p&r..'; fff()
        pr_data = language.sample_data_as_FuncData(2048)
        for h in set_topn:
            p, r, h_llcounts = language.estimate_precision_and_recall(h, pr_data, truncate=truncate_flag)
            prf_dict[h] = [p, r, 0 if p + r == 0 else 2 * p * r / (p + r)]
            h.fixed_ll_counts = h_llcounts

        topn = dict(enumerate(set_topn))
        print 'bcasting..'; fff()

    topn = comm.bcast(topn, root=0)
    prf_dict = comm.bcast(prf_dict, root=0)

    print rank, 'getting posterior'; fff()
    # work_list = slice_list(np.arange(0, 72, 6), size)
    work_list = slice_list(np.arange(120, 264, 12), size)

    cube = []
    for s in work_list[rank]:
        eval_data = language.sample_data_as_FuncData(s)
        for ind, h in topn.iteritems():
            h.likelihood_temperature = 100
            h.compute_posterior(eval_data)
        Z = logsumexp([h.posterior_score for h in topn.itervalues()])

        table = [[ind, h.posterior_score, prf_dict[h][0], prf_dict[h][1], prf_dict[h][2], h.fixed_ll_counts]
                 for ind, h in topn.iteritems()]
        table.sort(key=lambda x: x[1], reverse=True)
        cube += [[s, Z, table]]

        print rank, s, 'done'; fff()

    if rank == 0:
        for i in xrange(1, size):
            cube += comm.recv(source=i)
    else:
        comm.send(cube, dest=0)
        print rank, 'table sent'; fff()
        sys.exit(0)

    cube.sort(key=lambda x: x[0])
    dump([cube, topn], open(lang_name + '_stats' + suffix, 'w'))
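# A minimal sketch (not part of the original pipeline) of how the dumped stats
# could be read back and inspected. It relies on this module's `load` helper;
# `show_best_hypotheses` and its arguments are hypothetical names, and the
# unpacking follows the [cube, topn] layout documented in parse_cube above.
def show_best_hypotheses(stats_file, k=3):
    cube, topn = load(open(stats_file))
    for s, Z, table in cube:
        print 'data size:', s, 'log Z:', Z
        # table rows are already sorted by posterior score, best first
        for ind, score, p, r, f, strs in table[:k]:
            print '  ind=%d score=%.2f p=%.2f r=%.2f f=%.2f' % (ind, score, p, r, f)
            print '  hypothesis:', topn[ind]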
def make_staged_posterior_seq(_dir, temperature, lang_name, dtype):
    """
    Script:
        python parse_hypothesis.py --mode=make_staged_posterior_seq --file=file --temp=1 --language=AnBn --dtype=staged/uniform

    1. read the raw file
    2. compute the fixed Counter
    3. compute the posterior for different amounts of data

    Dumped posterior format:
        [topn, [Z, amount, finite, [s1, s2, ...]], [], [], ...]

    NOTE: if _dir is a previously dumped posterior seq, we reuse it.
    """
    if not (os.path.isfile(_dir) and 'posterior_seq' in _dir):
        topn = set()
        for filename in os.listdir(_dir):
            if ('staged' in filename or 'normal' in filename) and 'seq' not in filename:
                print 'load', filename
                _set = load(open(_dir + filename))
                topn.update([h for h in _set])
        topn = list(topn)

        # fix the llcnts to save time and make the curve smooth
        print 'get llcnts...'
        for h in topn:
            llcnts = Counter([h() for _ in xrange(2048)])
            h.fixed_ll_counts = llcnts

        seq = []
        seq.append(topn)

        for amount, finite in mk_staged_wlist(0, 200, 2, [48, 96]):
            print 'posterior on', amount, finite

            if dtype == 'staged':
                language = instance(lang_name, finite)
                eval_data = language.sample_data_as_FuncData(amount)
            elif dtype == 'uniform':
                eval_data = uniform_data(amount, 12)

            for h in topn:
                h.likelihood_temperature = temperature
                h.compute_posterior(eval_data)

            Z = logsumexp([h.posterior_score for h in topn])
            seq.append([Z, amount, finite, [h.posterior_score for h in topn]])

        dump(seq, open(dtype + '_posterior_seq' + suffix, 'w'))
    else:
        seq = load(open(_dir))

    # ====================== compute KL based on seq =======================
    print 'compute kl seq...'
    kl_seq = []
    topn = seq.pop(0)
    for i in xrange(len(seq) - 1):
        kl_seq.append([seq[i][1], compute_kl2(seq[i], seq[i + 1])])
    dump(kl_seq, open(dtype + '_kl_seq' + suffix, 'w'))
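# compute_kl2 is defined elsewhere in this codebase; below is a hedged sketch
# of the quantity it plausibly computes (an assumption, not the original
# implementation): the KL divergence between the normalized posteriors of two
# adjacent snapshots, each of the form [Z, amount, finite, [score, ...]]
# over the same topn list.
def compute_kl2_sketch(cur, nxt):
    p = np.exp(np.array(cur[3]) - cur[0])  # normalize scores with the stored Z
    q = np.exp(np.array(nxt[3]) - nxt[0])
    mask = p > 0  # KL terms with p=0 contribute nothing
    return np.sum(p[mask] * np.log(p[mask] / q[mask]))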
def parse_plot(lang_name, finite, is_plot):
    """
    Run: MPI supported.

    Example:
        mpiexec -n 12 python parse_hypothesis.py --mode=parse_plot --language=An --finite=3 --plot=yes --wfs=yes
    """
    _dir = 'out/final/'
    global size
    global rank
    topn = set()
    prf_dict = {}
    language = instance(lang_name, finite)

    if rank == 0:
        print 'loading..'; fff()
        for file_name in listdir(_dir):
            if lang_name + '_' in file_name:
                _set = load(open(_dir + file_name))
                topn.update([h for h in _set])

        print 'getting p&r..'; fff()
        pr_data = language.sample_data_as_FuncData(1024)
        for h in topn:
            p, r = language.estimate_precision_and_recall(h, pr_data)
            prf_dict[h] = [p, r, 0 if p + r == 0 else 2 * p * r / (p + r)]

        dump(prf_dict, open(lang_name + '_prf_dict' + suffix, 'w'))

    topn = comm.bcast(topn, root=0)
    prf_dict = comm.bcast(prf_dict, root=0)

    print rank, 'getting posterior'; fff()
    work_list = slice_list(np.arange(235, 300, 5), size)

    seq = []
    pnt_str = 'Weighted F-score' if options.WFS == 'yes' else 'Posterior Probability'
    for s in work_list[rank]:
        eval_data = language.sample_data_as_FuncData(s)
        for h in topn:
            h.likelihood_temperature = 100
            h.compute_posterior(eval_data)
        Z = logsumexp([h.posterior_score for h in topn])

        if options.WFS == 'yes':
            tmp = sum([prf_dict[h][2] * np.exp(h.posterior_score - Z) for h in topn])
        # TODO
        # else: tmp = sum([np.exp(h.posterior_score - Z) for h in topn if prf_dict[h][2] > 0.9])
        else:
            tmp = sum([np.exp(h.posterior_score - Z) for h in topn
                       if prf_dict[h][0] < 0.3 and prf_dict[h][1] > 0.9])

        if options.PROB == 'yes':
            dump([topn, Z], open(lang_name + '_prob_' + str(s) + suffix, 'w'))

        seq.append([s, tmp])
        print 'size: %.1f' % s, '%s: %.2f' % (pnt_str, tmp); fff()

        # debug
        _list = sorted(topn, key=lambda x: x.posterior_score, reverse=True)
        for i in xrange(3):
            print 'prob: ', np.exp(_list[i].posterior_score - Z), 'p,r: ', prf_dict[_list[i]][:2],
            print Counter([_list[i]() for _ in xrange(256)])
            print _list[i]
        print '=' * 50; fff()

    if rank == 0:
        for i in xrange(1, size):
            seq += comm.recv(source=i)
    else:
        comm.send(seq, dest=0)
        sys.exit(0)

    seq.sort(key=lambda x: x[0])
    dump(seq, open(lang_name + '_seq' + suffix, 'w'))

    if is_plot == 'yes':
        x, y = zip(*seq)
        plt.plot(x, y)
        plt.ylabel(pnt_str)
        plt.xlabel('Size of Data')
        plt.title(lang_name)
        plt.show()
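# For reference, the WFS branch above computes the posterior-weighted F-score
# E_p[F(h)] = sum_h F(h) * exp(score(h) - Z). A standalone restatement over
# hypothetical parallel lists of posterior scores and F-scores:
def weighted_f_score(scores, fscores):
    Z = logsumexp(scores)
    return sum(f * np.exp(s - Z) for s, f in zip(scores, fscores))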
    mpiexec -n 12 python my_search_stp.py --language=Dyck --finite=8 --N=2 --terminal=b --bound=7 --steps=100000
    mpiexec -n 12 python my_search_stp.py --language=SimpleEnglish --finite=8 --N=3 --bound=5 --steps=100000
"""

# ========================================================================================================
# Process command line arguments
# ========================================================================================================

fff = sys.stdout.flush

parser = OptionParser()
parser.add_option("--language", dest="LANG", type="string", default='An', help="name of a language")
parser.add_option("--steps", dest="STEPS", type="int", default=40000, help="number of samples to run")
parser.add_option("--top", dest="TOP_COUNT", type="int", default=20, help="number of top hypotheses to store")
parser.add_option("--finite", dest="FINITE", type="int", default=10, help="max_length that makes the language finite")
parser.add_option("--name", dest="NAME", type="string", default='', help="name of the output file")
parser.add_option("--N", dest="N", type="int", default=3, help="number of inner hypotheses")
parser.add_option("--terminal", dest="TERMINALS", type="string", default='', help="extra terminals")
parser.add_option("--bound", dest="BOUND", type="int", default=5, help="recursion bound")
(options, args) = parser.parse_args()

prefix = 'out/'
# prefix = '/home/lijm/WORK/yuan/lot/'
suffix = time.strftime('_' + options.NAME + '_%m%d_%H%M%S', time.localtime())

# set the output codec -- needed to display lambda to stdout
sys.stdout = codecs.getwriter('utf8')(sys.stdout)

if rank == 0:
    display_option_summary(options); fff()

# you need to run 12 machines on this data range
DATA_RANGE = np.arange(0, 70, 6)

language = instance(options.LANG, options.FINITE)
args = list(itertools.product([make_hypothesis], [language], DATA_RANGE))

# run on MPI
hypotheses = simple_mpi_map(run, args)
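# For clarity: itertools.product pairs make_hypothesis and language with each
# amount in DATA_RANGE, so the MPI map runs one job per data amount. A serial
# sketch of the same computation, assuming simple_mpi_map applies run to each
# argument tuple and that run returns a set of hypotheses (both assumptions):
def serial_map_sketch(data_range):
    results = []
    for amount in data_range:
        results.append(run(make_hypothesis, language, amount))
    return results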
def parse_plot(lang_name, finite, is_plot):
    """
    Run: MPI supported.

    NOTE: an earlier variant of parse_plot: every rank loads and scores the
    hypotheses itself, and the curve is always the posterior-weighted F-score
    over data sizes 0..29.
    """
    _dir = 'out/final/'
    global size
    global rank

    print 'loading..'; fff()
    topn = set()
    for file_name in listdir(_dir):
        if lang_name in file_name:
            _set = load(open(_dir + file_name))
            topn.update([h for h in _set])

    language = instance(lang_name, finite)

    print 'getting p&r..'; fff()
    prf_dict = {}
    pr_data = language.sample_data_as_FuncData(1024)
    for h in topn:
        p, r = language.estimate_precision_and_recall(h, pr_data)
        prf_dict[h] = [p, r, 0 if p + r == 0 else 2 * p * r / (p + r)]

    if rank == 0:
        dump(prf_dict, open(lang_name + '_prf_dict' + suffix, 'w'))

    print rank, 'getting posterior'; fff()
    work_list = slice_list(np.arange(0, 30, 1), size)

    seq = []
    for s in work_list[rank]:
        eval_data = language.sample_data_as_FuncData(s)
        for h in topn:
            h.likelihood_temperature = 100
            h.compute_posterior(eval_data)
        Z = logsumexp([h.posterior_score for h in topn])

        wfs = sum([prf_dict[h][2] * np.exp(h.posterior_score - Z) for h in topn])
        seq.append([s, wfs])
        print 'size: %.1f' % s, 'weighted F-score: %.2f' % wfs; fff()

        # debug
        _list = sorted(topn, key=lambda x: x.posterior_score, reverse=True)
        for i in xrange(3):
            print 'prob: ', np.exp(_list[i].posterior_score - Z), 'p,r: ', prf_dict[_list[i]][:2],
            print Counter([_list[i]() for _ in xrange(256)])
            print _list[i]
        print '=' * 50; fff()

    if rank == 0:
        for i in xrange(1, size):
            seq += comm.recv(source=i)
    else:
        comm.send(seq, dest=0)
        sys.exit(0)

    seq.sort(key=lambda x: x[0])
    dump(seq, open(lang_name + '_seq' + suffix, 'w'))

    if is_plot == 'yes':
        x, y = zip(*seq)
        plt.plot(x, y)
        plt.ylabel('Weighted F-score')
        plt.xlabel('Size of Data')
        plt.title(lang_name)
        plt.show()
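# slice_list is imported from elsewhere; below is a hedged sketch of the
# behavior the calls above rely on (an assumption, not the original helper):
# partition a sequence into `n` roughly equal chunks so that each MPI rank
# processes work_list[rank].
def slice_list_sketch(seq, n):
    chunks = [[] for _ in xrange(n)]
    for i, x in enumerate(seq):
        chunks[i % n].append(x)
    return chunks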
parser.add_option("--top", dest="TOP_COUNT", type="int", default=20, help="Top number of hypotheses to store") parser.add_option("--finite", dest="FINITE", type="int", default=10, help="specify the max_length to make language finite") parser.add_option("--name", dest="NAME", type="string", default='', help="name of file") (options, args) = parser.parse_args() suffix = time.strftime('_' + options.NAME + '_%m%d_%H%M%S', time.localtime()) # set the output codec -- needed to display lambda to stdout sys.stdout = codecs.getwriter('utf8')(sys.stdout) if is_master_process(): display_option_summary(options); fff() # you need to run 12 machine on that DATA_RANGE = np.arange(1, 64, 6) language = instance(options.LANG) args = list(itertools.product([make_hypothesis], [language], DATA_RANGE, [options.FINITE])) # run on MPI results = MPI_map(run, args) # ======================================================================================================== # Get stats # ======================================================================================================== # collapse all returned sets hypotheses = set() for r in results: hypotheses.update(r) # add the ith's results to the set dump(hypotheses, open('hypotheses'+suffix, 'w')) # get precision and recall for h