FunctionData(input=[], output={ 'h e s': size, 'm e s': size, 'm e g': size, 'h e g': size, 'm e n': size, 'h e m': size, 'm e k': size, 'k e s': size, 'h e k': size, 'k e N': size, 'k e g': size, 'h e n': size, 'm e N': size, 'k e n': size, 'h e N': size, 'f e N': size, 'g e N': size, 'n e N': size, 'n e s': size, 'f e n': size, 'g e n': size, 'g e m': size, 'f e m': size, 'g e k': size, 'f e k': size, 'f e g': size, 'f e s': size, 'n e g': size, 'k e m': size, 'n e m': size, 'g e s': size, 'n e k': size })
while (i < len(categories)) and (phase[i] != 'g'): selected_so_far.append(selected[i]) categories_so_far[selected[i]] = categories[selected[i]] i += 1 if ((i < len(categories)) and (i > 0)): stim = "".join(categories_so_far) all_stim.append(stim) all_cats.append(categories) all_obs.append(observed_cats) tot_guess += len(categories) - i #print selected_so_far # print stim # print data = [FunctionData(alpha=1. - 1e-7, input=all_C, output=stim)] # output="".join(categories))] #output=stim)] h0 = MyHypothesis() MAP = None best_post = -float("inf") best_out = "" n_comp = n_compatible(stim, concepts) print stim, n_comp while n_compatible(stim, concepts) < 2:
def make_data(N=20, f=TargetConcepts[0]):
    """Sample N labelled examples for the concept ``f``.

    Each example is built from one object drawn with ``sample_one`` from
    ``all_objects``: the object is the only input, ``f(object)`` is the
    recorded output, and alpha is fixed at 0.90.

    N: number of examples to draw.
    f: callable applied to each sampled object; defaults to
       ``TargetConcepts[0]`` (looked up once, when the function is defined).
    """

    def labelled(obj):
        # Pair a sampled object with the label the target concept gives it.
        return FunctionData(input=[obj], output=f(obj), alpha=0.90)

    return [labelled(sample_one(all_objects)) for _ in xrange(N)]
def make_staged_seq2(jump, temp): """ run: mpiexec -n 12 """ rec = load_hypo('out/simulations/staged/', ['staged', 'normal0', 'normal1']) seen = set() work_list = slice_list(range(size), 3) for e in rec: for h in e[1]: if h in seen: continue seen.add(h) if rank in work_list[0]: seq = [] infos = [[i, min(4 * ((int(i) - 1) / 48 + 1), 12)] for i in [10**e for e in np.arange(0, 2.2, 0.1)]] for e in infos: prob_dict = {} language = AnBn(max_length=e[1] + (e[1] % 2 != 0)) eval_data = language.sample_data_as_FuncData(e[0]) for h in seen: h.likelihood_temperature = temp prob_dict[h] = h.compute_posterior(eval_data) seq.append(prob_dict) print 'rank: ', rank, e, 'done' fff() elif rank in work_list[1]: seq = [] infos = [[i, 12] for i in [10**e for e in np.arange(0, 2.2, 0.1)]] for e in infos: prob_dict = {} language = AnBn(max_length=e[1]) eval_data = language.sample_data_as_FuncData(e[0]) for h in seen: h.likelihood_temperature = temp prob_dict[h] = h.compute_posterior(eval_data) seq.append(prob_dict) print 'rank: ', rank, e, 'done' fff() else: seq = [] infos = [[i, 12] for i in [10**e for e in np.arange(0, 2.2, 0.1)]] for e in infos: prob_dict = {} eval_data = uniform_data(e[0], e[1]) for h in seen: h.likelihood_temperature = temp prob_dict[h] = h.compute_posterior(eval_data) seq.append(prob_dict) print 'rank: ', rank, e, 'done' fff() # TODO no need ? from copy import deepcopy dict_0 = deepcopy(seq[0]) for h in dict_0: dict_0[h] = h.compute_posterior( [FunctionData(input=[], output=Counter())]) seq.insert(0, dict_0) dump(seq, open('seq' + str(rank) + suffix, 'w'))
def make_pos2(jump, temp): """ 1. read raw output 2. compute precision & recall on nonadjacent and adjacent contents 3. evaluate posterior probability on different data sizes 4. dump the sequence run: mpiexec -n 4 """ print 'loading..' fff() rec = load_hypo('out/simulations/nonadjacent/', ['0']) # TODO one do this print 'estimating pr' fff() pr_dict = {} _set = set() cnt_tmp = {} for e in rec: for h in e[1]: if h in _set: continue cnt = Counter([h() for _ in xrange(1024)]) cnt_tmp[h] = cnt base = sum(cnt.values()) num = 0 for k, v in cnt.iteritems(): if k is None or len(k) < 2: continue if k[0] + k[-1] in ['ab', 'cd', 'ef']: num += v pr_dict[h] = float(num) / base # fix the h_output h.h_output = cnt _set.add(h) work_list = range(2, 17, jump) for i in work_list: language = LongDependency(max_length=i) eval_data = {} for e in language.str_sets: eval_data[e] = 144.0 / len(language.str_sets) eval_data = [FunctionData(input=[], output=eval_data)] score = np.zeros(len(_set), dtype=np.float64) prec = np.zeros(len(_set), dtype=np.float64) # prob_dict = {} # test_list = [] for ind, h in enumerate(_set): h.likelihood_temperature = temp score[ind] = h.compute_posterior(eval_data) prec[ind] = pr_dict[h] # prob_dict[h] = h.compute_posterior(eval_data) # test_list.append([h.posterior_score, pr_dict[h], cnt_tmp[h], str(h), h]) # test_list.sort(key=lambda x: x[0], reverse=True) # Z = logsumexp([h.posterior_score for h in _set]) # # weighted_axb = sum([np.exp(e[0] - Z) * e[1] for e in test_list]) # print i, weighted_axb # for i_t in xrange(3): # print 'prob: ', np.exp(test_list[i_t][0] - Z), 'axb_f-score', test_list[i_t][1] # print test_list[i_t][2] # # print test_list[i_t][4].compute_posterior(eval_data) # # print language.estimate_precision_and_recall(test_list[i_t][5], cnt_tmp[test_list[i_t][5]]) # print '='*50 # fff() # # f = open('non_w'+suffix, 'a') # print >> f, Z, weighted_axb # print # f.close() # # print 'size: %i' % i, Z, weighted_axb; fff() if rank != 0: 
comm.send(score, dest=0) comm.send(prec, dest=0) sys.exit(0) else: for r in xrange(size - 1): score += comm.recv(source=r + 1) prec += comm.recv(source=r + 1) score /= size prec /= size Z = logsumexp(score) weighted_axb = np.sum(np.exp(score - Z) * prec) f = open('non_w' + suffix, 'a') print >> f, Z, weighted_axb print i, Z, weighted_axb fff() f.close()
#for t in generate_trees(grammar): #print t # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Set up data -- true output means attraction (p=positive; n=negative) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ data = [] for a, b in itertools.product(objects, objects): myinput = [a, b] # opposites (n/p) interact; x interacts with nothing myoutput = (a[0] != b[0]) and (a[0] != 'x') and (b[0] != 'x') data.append(FunctionData(input=myinput, output=myoutput)) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Run mcmc # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__ == "__main__": from LOTlib.Proposals.RegenerationProposal import RegenerationProposal #mp = MixtureProposal([RegenerationProposal(grammar), InsertDeleteProposal(grammar)] ) mp = RegenerationProposal(grammar) from LOTlib.Hypotheses.LOTHypothesis import LOTHypothesis h0 = LOTHypothesis( grammar, args=['x', 'y'], ALPHA=0.999, proposal_function=mp ) # alpha here trades off with the amount of data. Currently assuming no noise, but that's not necessary
def make_data(n=1, alpha=0.99):
    """Build parity examples for the numbers 1..9, repeated n times.

    For each x the list holds an ``['even', x]`` datum whose output is
    ``x % 2 == 0`` followed by an ``['odd', x]`` datum whose output is
    ``x % 2 == 1``.

    n: how many times the 18-element list is repeated.
    alpha: passed to every FunctionData.
    """
    parities = (('even', 0), ('odd', 1))
    one_pass = [
        FunctionData(input=[word, x], output=(x % 2 == remainder), alpha=alpha)
        for x in xrange(1, 10)
        for word, remainder in parities
    ]
    return one_pass * n
# Build up the info about the data # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ from LOTlib.DataAndObjects import FunctionData L = [] # each hypothesis's cumulative likelihood to each data point GroupLength = [] NYes = [] NTrials = [] Output = [] domain = range(1, 101) for os in observed_sets: datum = FunctionData(input=[], output=os, alpha=ALPHA) # compute the likelihood for all the data here for h in hypotheses: h.cached_set = h() h.stored_likelihood = h.compute_single_likelihood( datum, cached_set=h.cached_set) L.append([h.stored_likelihood for h in hypotheses]) # each likelihood gl = 0 # how many did we actually ad? for i in domain: k = tuple([os, i]) if k in human_nyes and k in human_ntrials: gl += 1
# -*- coding: utf-8 -*- from LOTlib.Hypotheses.GaussianLOTHypothesis import GaussianLOTHypothesis from LOTlib.DataAndObjects import FunctionData from LOTlib.FiniteBestSet import FiniteBestSet from LOTlib.Inference.MetropolisHastings import mh_sample from LOTlib.Miscellaneous import qq from Grammar import grammar """ This uses Galileo's data on a falling ball. See: http://www.amstat.org/publications/jse/v3n1/datasets.dickey.html See also, Jeffreys, W. H., and Berger, J. O. (1992), "Ockham's Razor and Bayesian Analysis," American Scientist, 80, 64-72 (Erratum, p. 116). """ # NOTE: these must be floats, else we get hung up on powers of ints data_sd = 50.0 data = [ FunctionData(input=[1000.], output=1500., ll_sd=data_sd), FunctionData(input=[828.], output=1340., ll_sd=data_sd), FunctionData(input=[800.], output=1328., ll_sd=data_sd), FunctionData(input=[600.], output=1172., ll_sd=data_sd), FunctionData(input=[300.], output=800., ll_sd=data_sd), FunctionData(input=[0.], output=0., ll_sd=data_sd) # added 0,0 since it makes physical sense. ] CHAINS = 10 STEPS = 10000000 SKIP = 0 PRIOR_TEMPERATURE = 1.0 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Define the grammar
def make_data(n=1, alpha=0.99, *args, **kwargs):
    """Return the 16 pairwise attraction examples, repeated n times.

    Set up data -- true output means attraction (p=positive; n=negative):
    a pair is labelled True exactly when its two names start with different
    letters.

    n: how many times the 16-element table is repeated.
    alpha: passed to every FunctionData.
    *args, **kwargs: accepted and ignored.
    """
    firsts = ("p1", "p2", "n1", "n2")
    seconds = ("n1", "n2", "p1", "p2")
    table = [
        FunctionData(input=[a, b], output=(a[0] != b[0]), alpha=alpha)
        for a in firsts
        for b in seconds
    ]
    return table * n
import re
import os
from collections import defaultdict
from LOTlib.DataAndObjects import FunctionData, Obj

CONCEPT_DIR = "Concepts"

# Maps each concept file name to the FunctionData parsed from its lines.
concept2data = defaultdict(list)


def _parse_concept_line(line):
    """Turn one tab-separated line of a concept file into a FunctionData.

    The first field holds a run of "#t"/"#f" labels; every later field is
    one object written as "shape,color,size".
    """
    fields = line.strip().split("\t")

    # parse the true/false
    labels = [flag == "#t" for flag in re.findall(r"#[tf]", fields[0])]

    # parse the set
    objects = []
    for spec in fields[1:]:
        attrs = spec.split(",")  # split within obj via commas
        objects.append(Obj(shape=attrs[0], color=attrs[1], size=int(attrs[2])))

    return FunctionData(input=objects, output=labels)


for pth in os.listdir(CONCEPT_DIR):
    if re.search(r"L[34]", pth):
        continue  # skip these!
    with open(os.path.join(CONCEPT_DIR, pth), 'r') as f:
        # The first line of the file is consumed but never used; the default
        # keeps an empty file from raising.
        next(f, None)
        for line in f:
            concept2data[pth].append(_parse_concept_line(line))
#for i in xrange(100): #print grammar.generate() # Or we can make them as hypotheses (functions of S): #for i in xrange(100): #print LOTHypothesis(grammar, args=['S']) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Or real inference: # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ from LOTlib.DataAndObjects import FunctionData, Obj # for nicely managing data from LOTlib.Inference.MetropolisHastings import mh_sample # for running MCMC # Make up some data -- here just one set containing {red, red, green} colors data = [ FunctionData(input=[ {Obj(color='red'), Obj(color='red'), Obj(color='green')} ], \ output=True) ] # Create an initial hypothesis h0 = LOTHypothesis(grammar, args=['S']) # OR if we want to specify and use insert/delete proposals #from LOTlib.Proposals import * #h0 = LOTHypothesis(grammar, proposal_function=MixtureProposal(grammar, [RegenerationProposal(grammar), InsertDeleteProposal(grammar)] ) ) if __name__ == "__main__": # MCMC! for h in mh_sample(h0, data, 4000): # run sampler #for h in unique(mh_sample(h0, data, 4000)): # get unique samples # hypotheses' .prior, .likelihood, and .posterior_score are set in mh_sample print h.likelihood, h.prior, h.posterior_score, h
# ======================================================================================================== # Process command line arguments # ======================================================================================================== (options, args) = parser.parse_args() suffix = time.strftime('_' + options.NAME + '_%m%d_%H%M%S', time.localtime()) prefix = '../out/simulations/skewed/' # ======================================================================================================== # Running # ======================================================================================================== language = AnBn() show_info('running skewed input case..') rec = probe_MHsampler(make_hypothesis('AnBn'), language, options, prefix + 'skewed_out_' + str(rank) + suffix) show_info('running normal input case..') CASE += 1 cnt = Counter() num = 64.0 * 2 / options.FINITE for i in xrange(1, options.FINITE / 2 + 1): cnt['a' * i + 'b' * i] = num rec1 = probe_MHsampler(make_hypothesis('AnBn'), language, options, prefix + 'normal_out' + str(rank) + suffix, data=[FunctionData(input=[], output=cnt)])
def make_data(n):
    """Return a single datum giving every string in DATA_STRINGS the count n.

    The result is a one-element list holding a FunctionData with no input,
    the string -> n mapping as output, and alpha 0.999.
    """
    counts = dict.fromkeys(DATA_STRINGS, n)
    datum = FunctionData(input=[], output=counts, alpha=0.999)
    return [datum]
grammar.add_rule('COLOR', q('mauve'), None, 1.0) grammar.add_rule('SHAPE', q('square'), None, 1.0) grammar.add_rule('SHAPE', q('circle'), None, 1.0) grammar.add_rule('SHAPE', q('triangle'), None, 1.0) grammar.add_rule('SHAPE', q('diamond'), None, 1.0) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## Make up some data # Let's give data from a simple conjunction (note this example data is not exhaustive) from LOTlib.DataAndObjects import FunctionData, Obj # FunctionData takes a list of arguments and a return value. The arguments are objects (which are handled correctly automatically # by is_color_ and is_shape_ data = [ FunctionData( [Obj(shape='square', color='red')], True), \ FunctionData( [Obj(shape='square', color='blue')], False), \ FunctionData( [Obj(shape='triangle', color='blue')], False), \ FunctionData( [Obj(shape='triangle', color='red')], False), \ ] # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## Other standard exports from LOTlib.Hypotheses.RationalRulesLOTHypothesis import RationalRulesLOTHypothesis def make_h0(value=None): return RationalRulesLOTHypothesis(grammar=DNF, value=value, rrAlpha=1.0)
from Run import * from LOTlib.Grammar import Grammar from LOTlib.DataAndObjects import FunctionData # -------------------------------------------------------------------------------------------------------- # Mixture model if __name__ == "__main__": path = os.getcwd() interval_data = [ FunctionData(input=[16], output={ 99: (30, 5), 64: (5, 30) }) ] math_data = [FunctionData(input=[16], output={99: (5, 30), 64: (30, 5)})] # run(grammar=mix_grammar, mixture_model=1, data=math_data, # ngh='enum7', domain=100, alpha=0.9, # iters=120000, skip=120, cap=1000, # print_stuff='', pickle_file='out/mix_math_120k.p', # csv_file=path+'/out/mix_math_120k') grammar_n = 10000 skip = 10 cap = grammar_n / skip hypotheses = []
""" if __name__ == '__main__': comm = MPI.COMM_WORLD rank = comm.Get_rank() # ======================================================================================================== # Process command line arguments # ======================================================================================================== (options, args) = parser.parse_args() suffix = time.strftime('_' + options.NAME + '_%m%d_%H%M%S', time.localtime()) prefix = '../out/simulations/skewed/' # ======================================================================================================== # Running # ======================================================================================================== language = AnBn() show_info('running skewed input case..') rec = probe_MHsampler(make_hypothesis('AnBn'), language, options, prefix + 'skewed_out_' + str(rank) + suffix) show_info('running normal input case..') CASE += 1 cnt = Counter() num = 64.0 * 2 / options.FINITE for i in xrange(1, options.FINITE/2+1): cnt['a'*i+'b'*i] = num rec1 = probe_MHsampler(make_hypothesis('AnBn'), language, options, prefix + 'normal_out' + str(rank) + suffix, data=[FunctionData(input=[], output=cnt)])
# Define a grammar object # Defaultly this has a start symbol called 'START' but we want to call # it 'EXPR' grammar = Grammar(start='EXPR') # Define some operations grammar.add_rule('EXPR', '(%s + %s)', ['EXPR', 'EXPR'], 1.0) grammar.add_rule('EXPR', '(%s * %s)', ['EXPR', 'EXPR'], 1.0) grammar.add_rule('EXPR', '(float(%s) / float(%s))', ['EXPR', 'EXPR'], 1.0) grammar.add_rule('EXPR', '(-%s)', ['EXPR'], 1.0) # And define some numbers. We'll give them a 1/n^2 probability for n in xrange(1, 10): grammar.add_rule('EXPR', str(n), None, 10.0 / n**2) data = [FunctionData(input=[6], output=12, alpha=0.95)] #h = MyHypothesis() #print h.compute_prior(), h.compute_likelihood(data), h # define a "starting hypothesis". This one is essentially copied by # all proposers, so the sampler doesn't need to know its type or anything. h0 = MyHypothesis() from collections import Counter count = Counter() for h in MHSampler(h0, data, steps=10000): count[h] += 1 #for h in sorted(count.keys(), key=lambda x: count[x]): # print count[h], h.posterior_score, h
# BASE-SET is here a set of BASE-OBJECTS (non-args) grammar.add_rule('BASE-SET', 'set_add_', ['BASE-OBJECT', 'BASE-SET'], 1.0) grammar.add_rule('BASE-SET', 'set_', [], 1.0) grammar.add_rule('BASE-OBJECT', qq('p1'), None, 1.0) grammar.add_rule('BASE-OBJECT', qq('p2'), None, 1.0) grammar.add_rule('BASE-OBJECT', qq('n1'), None, 1.0) grammar.add_rule('BASE-OBJECT', qq('n2'), None, 1.0) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Set up data -- true output means attraction (p=positive; n=negative) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ data = [ FunctionData(input=["p1", "n1"], output=True), FunctionData(input=["p1", "n2"], output=True), FunctionData(input=["p1", "p1"], output=False), FunctionData(input=["p1", "p2"], output=False), FunctionData(input=["p2", "n1"], output=True), FunctionData(input=["p2", "n2"], output=True), FunctionData(input=["p2", "p1"], output=False), FunctionData(input=["p2", "p2"], output=False), FunctionData(input=["n1", "n1"], output=False), FunctionData(input=["n1", "n2"], output=False), FunctionData(input=["n1", "p1"], output=True), FunctionData(input=["n1", "p2"], output=True), FunctionData(input=["n2", "n1"], output=False), FunctionData(input=["n2", "n2"], output=False), FunctionData(input=["n2", "p1"], output=True), FunctionData(input=["n2", "p2"], output=True)
def make_data(size=options.datasize):
    """Return a single datum giving each of the 32 listed strings the count ``size``.

    size: count stored for every string; the default is
          ``options.datasize`` as it was when this function was defined.
    """
    observed = (
        'h e s', 'm e s', 'm e g', 'h e g', 'm e n', 'h e m', 'm e k', 'k e s',
        'h e k', 'k e N', 'k e g', 'h e n', 'm e N', 'k e n', 'h e N', 'f e N',
        'g e N', 'n e N', 'n e s', 'f e n', 'g e n', 'g e m', 'f e m', 'g e k',
        'f e k', 'f e g', 'f e s', 'n e g', 'k e m', 'n e m', 'g e s', 'n e k',
    )
    counts = dict.fromkeys(observed, size)
    return [FunctionData(input=[], output=counts)]
def parse_nonadjacent(_dir, temperature): """ 1. read raw hypos 2. get fixed llcnts 3. compute posterior given different data pool sizes NOTE: if _dir is previously dumped topn then load it """ if 'nonadjacent_topn' not in _dir: topn = set() for filename in os.listdir(_dir): if 'nonadjacent' in filename and 'seq' not in filename: print 'load', filename _set = load(open(_dir + filename)) topn.update([h for h in _set]) topn = list(topn) # fix the llcnts to save time and make curve smooth print 'get llcnts...' topn = gen_fixlen_llcnts(topn, 5) dump(topn, open(_dir + '_nonadjacent_topn' + suffix, 'w')) else: print 'load', _dir topn = load(open(_dir)) # find all correct hypotheses topn = list(topn) correct_set = set() for i in xrange(len(topn)): flag = True for k, v in topn[i].fixed_ll_counts.iteritems(): if len(k) < 2: continue elif k[0] == 'a' and k[-1] in 'b': continue elif k[0] == 'c' and k[-1] in 'bd': continue elif k[0] == 'e' and k[-1] in 'bdf': continue flag = False break if flag: correct_set.add(i) print len(correct_set), 'of', len(topn), 'are correct' # get posterior w_list = range(2, 25, 1) amount_list = range(24, 144, 5) posterior_seq = [] for i in xrange(len(w_list)): pool_size = w_list[i] language = LongDependency(max_length=pool_size) eval_data = [ FunctionData(input=[], output={ e: float(amount_list[i]) / pool_size for e in language.str_sets }) ] for h in topn: h.likelihood_temperature = temperature h.compute_posterior(eval_data) Z = logsumexp([h.posterior_score for h in topn]) prob = 0 for i in xrange(len(topn)): if i in correct_set: prob += np.exp(topn[i].posterior_score - Z) print 'pool_size', pool_size, 'prob', prob posterior_seq.append([pool_size, prob]) #debug _list = [h for h in topn] _list.sort(key=lambda x: x.posterior_score, reverse=True) for i in xrange(3): print 'prob: ', np.exp(_list[i].posterior_score - Z), print h.fixed_ll_counts print _list[i] print '=' * 50 fff() dump(posterior_seq, open('nonadjacent_posterior_seq' + suffix, 'w'))
female(michelle). parent(michelle, sasha). parent(michelle, malia). parent(barak, sasha). parent(barak, malia). female(sasha). female(malia). parent(baraksr, barak). parent(ann, barak). parent(hussein, baraksr). parent(akumu, baraksr). """ data = [FunctionData(input=["grandparent(baraksr, QUERY)"], output="sahsa", alpha=0.99), FunctionData(input=["grandparent(baraksr, QUERY)"], output="malia", alpha=0.99), FunctionData(input=["grandparent(ann, QUERY)"], output="sahsa", alpha=0.99), FunctionData(input=["grandparent(ann, QUERY)"], output="malia", alpha=0.99), FunctionData(input=["grandparent(hussein, QUERY)"], output="barak", alpha=0.99), FunctionData(input=["grandparent(akumu, QUERY)"], output="barak", alpha=0.99) ] def make_hypothesis(**kwargs): return PrologHypothesis(base_facts=BASE_FACTS, **kwargs) def make_data(n=1): return data*n # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def make_pos(jump, temp): """ 1. read raw output 2. compute precision & recall on nonadjacent and adjacent contents 3. evaluate posterior probability on different data sizes 4. dump the sequence run: mpiexec -n 4 """ print 'loading..' fff() rec = load_hypo('out/simulations/nonadjacent/', ['0']) print 'estimating pr' fff() pr_dict = {} _set = set() cnt_tmp = {} for e in rec: for h in e[1]: if h in _set: continue cnt = Counter([h() for _ in xrange(256)]) # cnt = Counter([h() for _ in xrange(10)]) cnt_tmp[h] = cnt base = sum(cnt.values()) num = 0 for k, v in cnt.iteritems(): if k is None or len(k) < 2: continue if k[0] == 'a' and k[-1] == 'b': num += v pr_dict[h] = float(num) / base _set.add(h) work_list = range(2, 24, jump) space_seq = [] for i in work_list: language = LongDependency(max_length=i) eval_data = {} for e in language.str_sets: eval_data[e] = 144.0 / len(language.str_sets) eval_data = [FunctionData(input=[], output=eval_data)] prob_dict = {} ada_dict = {} test_list = [] for h in _set: h.likelihood_temperature = temp prob_dict[h] = h.compute_posterior(eval_data) p, r = language.estimate_precision_and_recall(h, cnt_tmp[h]) ada_dict[h] = 2 * p * r / (p + r) if p + r != 0 else 0 test_list.append([ h.posterior_score, ada_dict[h], pr_dict[h], cnt_tmp[h], str(h) ]) Z = logsumexp([h.posterior_score for h in _set]) test_list.sort(key=lambda x: x[0], reverse=True) weighted_x = 0 weighted_axb = 0 for e in test_list: weighted_x += np.exp(e[0] - Z) * e[1] weighted_axb += np.exp(e[0] - Z) * e[2] f = open('non_w' + suffix, 'a') print >> f, weighted_x, weighted_axb f.close() # print rank, i, '='*50 # for i_t in xrange(3): # print 'prob: ', np.exp(test_list[i_t][0] - Z), 'x_f-score', test_list[i_t][1], 'axb_f-score', test_list[i_t][2] # print test_list[i_t][3] # print test_list[i_t][5].compute_posterior(eval_data) # print language.estimate_precision_and_recall(test_list[i_t][5], cnt_tmp[test_list[i_t][5]]) # fff() # dump(test_list, 
open('test_list_'+str(rank)+'_'+str(i)+suffix, 'w')) # space_seq.append([prob_dict, ada_dict]) print 'rank', rank, i, 'done' fff() dump([space_seq, pr_dict], open('non_seq' + str(rank) + suffix, 'w'))
def runparts(size, x, p): #problem: right now only recording last partition, never saving from others. print "Start: " + str(x) + " on this many: " + str(size) try: #make new TopN for each data amount topn = TopN(N=200, key="posterior_score") print "Starting on partition ", p # Now we have to go in and fill in the nodes that are nonterminals # We can do this with generate v = grammar.generate(copy(p)) h0 = MyHypothesis(grammar, value=v) data = [ FunctionData(input=[], output={ 'n i k': size, 'h i N': size, 'f a n': size, 'g i f': size, 'm a N': size, 'f a m': size, 'g i k': size, 'k a n': size, 'f a f': size, 'g i n': size, 'g i m': size, 'g i s': size, 's i f': size, 's i n': size, 'n i s': size, 's i m': size, 's i k': size, 'h a N': size, 'f i N': size, 'h i m': size, 'h i n': size, 'h a m': size, 'n i N': size, 'h i k': size, 'f a s': size, 'f i n': size, 'h i f': size, 'n i m': size, 'g i N': size, 'h a g': size, 's i N': size, 'n i n': size, 'f i m': size, 's i s': size, 'h i s': size, 'n a s': size, 'k a s': size, 'f i s': size, 'n i f': size, 'm i n': size, 's a s': size, 'f a g': size, 'k a g': size, 'k a f': size, 's a m': size, 'n a f': size, 'n a g': size, 'm i N': size, 's a g': size, 'f i k': size, 'k a m': size, 'n a n': size, 's a f': size, 'n a m': size, 'm a s': size, 'h a f': size, 'h a s': size, 'n a N': size, 'm i s': size, 's a n': size, 's a N': size, 'm i k': size, 'f a N': size, 'm i m': size, 'm a g': size, 'm a f': size, 'f i f': size, 'k a N': size, 'h a n': size, 'm a n': size, 'm a m': size, 'm i f': size }) ] for h in break_ctrlc( MHSampler(h0, data, steps=options.steps, trace=False)): # print "\t", h.posterior_score, h topn.add(h) return size, set(topn) except Exception as e: print "*** Exception ignored: ", e #if we fail, we can return a blank TopN return size, set()
def parse_nonadjacent(temperature): """ load the hypothesis space and compute weighted F-scores of nonadjacent dependency on different pool sizes. replace the make_pos function example script: mpiexec -n 12 python parse_hypothesis.py --mode=nonadjacent_mk --temp=100 """ eval_data_size = 1024 global size global rank pr_dict = {} _set = set() if rank == 0: print 'loading..' fff() rec = load_hypo('out/simulations/nonadjacent/', ['_']) print 'estimating pr' fff() for e in rec: for h in e[1]: if h in _set: continue cnt = Counter([h() for _ in xrange(eval_data_size)]) num = 0 for k, v in cnt.iteritems(): if k is None or len(k) < 2: continue if k[0] + k[-1] in ['ab', 'cd', 'ef']: num += v pr_dict[h] = float(num) / eval_data_size _set.add(h) #debug _list = [[h, pr] for h, pr in pr_dict.iteritems()] _list.sort(key=lambda x: x[1], reverse=True) for i in xrange(10): print 'p,r: ', _list[i][1], print Counter([_list[i][0]() for _ in xrange(256)]) print _list[i][0] print '=' * 50 fff() print "sync..." 
fff() pr_dict = comm.bcast(pr_dict, root=0) _set = comm.bcast(_set, root=0) # work_list = slice_list(np.arange(2, 65, 2), size) work_list = slice_list(np.arange(10, 66, 5), size) seq = [] for s in work_list[rank]: wfs = 0.0 language = LongDependency(max_length=s) eval_data = [ FunctionData(input=[], output={ e: float(eval_data_size) / s for e in language.str_sets }) ] for h in _set: h.likelihood_temperature = temperature h.compute_posterior(eval_data) Z = logsumexp([h.posterior_score for h in _set]) seq.append([ s, sum([pr_dict[h] * np.exp(h.posterior_score - Z) for h in _set]) ]) #debug _list = [h for h in _set] _list.sort(key=lambda x: x.posterior_score, reverse=True) print 'pool size: ', s for i in xrange(3): print 'prob: ', np.exp(_list[i].posterior_score - Z), 'p,r: ', pr_dict[_list[i]], print Counter([_list[i]() for _ in xrange(256)]) print _list[i] print '=' * 50 fff() if rank == 0: for i in xrange(1, size): seq += comm.recv(source=i) else: comm.send(seq, dest=0) sys.exit(0) seq.sort(key=lambda x: x[0]) f = open('nonadjacent_wfs_seq' + suffix, 'w') for s, wfs in seq: print >> f, s, wfs f.close()
def make_data(size=options.datasize):
    """Return a single datum giving each of the 72 listed strings the count ``size``.

    size: count stored for every string; the default is
          ``options.datasize`` as it was when this function was defined.
    """
    observed = (
        'n i k', 'h i N', 'f a n', 'g i f', 'm a N', 'f a m', 'g i k', 'k a n',
        'f a f', 'g i n', 'g i m', 'g i s', 's i f', 's i n', 'n i s', 's i m',
        's i k', 'h a N', 'f i N', 'h i m', 'h i n', 'h a m', 'n i N', 'h i k',
        'f a s', 'f i n', 'h i f', 'n i m', 'g i N', 'h a g', 's i N', 'n i n',
        'f i m', 's i s', 'h i s', 'n a s', 'k a s', 'f i s', 'n i f', 'm i n',
        's a s', 'f a g', 'k a g', 'k a f', 's a m', 'n a f', 'n a g', 'm i N',
        's a g', 'f i k', 'k a m', 'n a n', 's a f', 'n a m', 'm a s', 'h a f',
        'h a s', 'n a N', 'm i s', 's a n', 's a N', 'm i k', 'f a N', 'm i m',
        'm a g', 'm a f', 'f i f', 'k a N', 'h a n', 'm a n', 'm a m', 'm i f',
    )
    counts = dict.fromkeys(observed, size)
    return [FunctionData(input=[], output=counts)]
def make_data(n=1, alpha=0.999):
    """Return four labelled single-object examples, repeated n times.

    Only the red square is labelled True; the blue square, blue triangle
    and red triangle are labelled False.

    n: how many times the four-element list is repeated.
    alpha: passed to every FunctionData.
    """
    examples = (
        ('square', 'red', True),
        ('square', 'blue', False),
        ('triangle', 'blue', False),
        ('triangle', 'red', False),
    )
    one_pass = [
        FunctionData(input=[Obj(shape=shape, color=color)],
                     output=label,
                     alpha=alpha)
        for shape, color, label in examples
    ]
    return one_pass * n
ALPHA = 0.001 STEPS = 1000 N_H = 20 seqs_trans = {} c1 = vanilla_conditions(True, False)[0:2] c2 = vanilla_conditions(False, True)[0:1] for to_seq in c1: for from_seq in c2: print_star("") print from_seq, to_seq data = [ FunctionData(alpha=ALPHA, input=[from_seq], output={to_seq: len(to_seq)}) ] h0 = MyHypothesis() step = 0 tn = TopN(N=N_H) # Stream from the sampler to a printer for h in MHSampler(h0, data, steps=STEPS, acceptance_temperature=5.): tn.add(h) print for h in tn.get_all(sorted=True):
def uniform_data(size, max_length=None):
    """Return one datum in which every a^k b^k string gets the same count.

    Strings are built for k = 1 .. max_length / 2 and each one receives
    ``size * 2 / max_length``.

    size: total amount that the per-string count is derived from.
    max_length: upper bound on string length.  It is used as a divisor, so
                a value has to be passed despite the ``None`` default.
    """
    per_string = size * 2 / max_length
    counts = Counter()
    for half in xrange(1, max_length / 2 + 1):
        counts[('a' * half) + ('b' * half)] = per_string
    return [FunctionData(input=[], output=counts)]
elif fn.name == 'question_': return '(%s)?' % to_regex(fn.args[0]) elif fn.name == 'or_': return '(%s|%s)' % tuple(map(to_regex, fn.args)) elif fn.name == 'str_append_': return '%s%s' % (fn.args[0], to_regex(fn.args[1])) elif fn.name == 'terminal_': return '%s' % fn.args[0] elif fn.name == '': return to_regex(fn.args[0]) else: assert False, fn ########################################################## # Define some data data = [ FunctionData(input=['aaaa'], output=True),\ FunctionData(input=['aaab'], output=False),\ FunctionData(input=['aabb'], output=False),\ FunctionData(input=['aaba'], output=False),\ FunctionData(input=['aca'], output=True),\ FunctionData(input=['aaca'], output=True),\ FunctionData(input=['a'], output=True) ] ########################################################## # make_h0 def make_h0(value=None): return RegexHypothesis(grammar, value=value, ALPHA=0.999)