def probe_MHsampler(h, language, options, name, size=64, data=None, init_size=None, iters_per_stage=None, sampler=None, ret_sampler=False):
    get_data = language.sample_data_as_FuncData
    evaluation_data = get_data(size, max_length=options.FINITE)

    if data is None:
        if init_size is None:
            data = evaluation_data
        else:
            data = get_data(n=size, max_length=init_size)

    if sampler is None:
        sampler = MHSampler(h, data)
    else:
        sampler.data = data

    best_hypotheses = TopN(N=options.TOP_COUNT)

    iter = 0
    for h in sampler:
        if iter == options.STEPS:
            break

        if iter % 100 == 0:
            print '---->', iter

        best_hypotheses.add(h)

        if iter % options.PROBE == 0:
            for h in best_hypotheses:
                h.compute_posterior(evaluation_data)
            Z = logsumexp([h.posterior_score for h in best_hypotheses])

            pr_data = get_data(1024, max_length=options.FINITE)
            weighted_score = 0
            for h in best_hypotheses:
                precision, recall = language.estimate_precision_and_recall(h, pr_data)
                if precision + recall != 0:
                    f_score = precision * recall / (precision + recall)
                    weighted_score += np.exp(h.posterior_score - Z) * f_score
            weighted_score *= 2

            to_file([[iter, Z, weighted_score]], name)

        if init_size is not None and iter % iters_per_stage == 0:
            init_size += 2
            sampler.data = get_data(n=size, max_length=init_size)

        iter += 1

    if ret_sampler:
        return sampler
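# A minimal, self-contained sketch of the posterior-weighted F-score that
# probe_MHsampler logs above: each hypothesis contributes its F1 score weighted
# by its normalized posterior exp(score - Z). The scores and precision/recall
# pairs below are made-up illustrative numbers, not LOTlib output; only numpy
# is assumed.
import numpy as np

def weighted_f_score(posterior_scores, pr_pairs):
    # posterior_scores: unnormalized log-posteriors, one per hypothesis
    # pr_pairs: (precision, recall) for each hypothesis
    Z = np.logaddexp.reduce(posterior_scores)  # log of the normalizing constant
    score = 0.0
    for lp, (p, r) in zip(posterior_scores, pr_pairs):
        if p + r > 0:
            # np.exp(lp - Z) is this hypothesis's normalized posterior weight;
            # 2pr/(p+r) is F1 (probe_MHsampler computes pr/(p+r) and doubles at the end)
            score += np.exp(lp - Z) * (2.0 * p * r / (p + r))
    return score

print weighted_f_score([-1.0, -2.5, -4.0], [(0.9, 0.8), (0.5, 0.5), (1.0, 0.1)])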
def __init__(self, h0, data, prior_schedule=None, likelihood_schedule=None, **kwargs):
    MHSampler.__init__(self, h0, data, **kwargs)

    if prior_schedule is None:
        prior_schedule = ConstantSchedule(1.0)
    if likelihood_schedule is None:
        likelihood_schedule = ConstantSchedule(1.0)

    self.prior_schedule = prior_schedule
    self.likelihood_schedule = likelihood_schedule
def mpirun(d):
    """
    Generate NumberGameHypotheses using MPI.
    """
    if options.grammar_scale:
        grammar_ = grammar_gamma(grammar, options.grammar_scale)
    else:
        grammar_ = grammar

    h0 = NumberGameHypothesis(grammar=grammar_, domain=100, alpha=0.9)
    mh_sampler = MHSampler(h0, d.input, options.iters)

    # hypotheses = TopN(N=options.N)
    hypotheses = set()
    # This dict keys hypotheses by their extension so we don't add duplicate
    # hypothesis sets, e.g. h1() == [4], h2() == [4]
    h_sets = {}

    for h in break_ctrlc(mh_sampler):
        h_set = str(h())
        if h_set in h_sets:
            if h.prior > h_sets[h_set].prior:
                hypotheses.remove(h_sets[h_set])
                h_sets[h_set] = h
                hypotheses.add(h)
        else:
            h_sets[h_set] = h
            hypotheses.add(h)

    top1000 = sorted(hypotheses, key=lambda h: -h.posterior_score)[0:1000]
    return top1000
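# A minimal sketch of the dedup-by-behavior pattern used in mpirun above: keep
# one representative per observable behavior (the key), preferring the one with
# the higher score. Plain Python with generic names; 'items' stands in for the
# sampled hypotheses, 'key' for str(h()), and 'score' for h.prior.
def dedup_best(items, key, score):
    best = {}  # key(v) -> best-scoring value seen so far for that key
    for v in items:
        k = key(v)
        if k not in best or score(v) > score(best[k]):
            best[k] = v
    return set(best.values())

# e.g., keep the shortest string per first letter:
print dedup_best(["apple", "ant", "bee", "bat"], key=lambda s: s[0], score=lambda s: -len(s))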
def construct_hypothesis_space(data_size):
    all_hypotheses = TopN()
    print 'Data size: ', data_size
    for i in range(RUNS):
        print 'Run: ', i
        hypotheses = TopN(25)
        data = generate_data(data_size)
        learner = GriceanQuantifierLexicon(make_my_hypothesis, my_weight_function)
        for w in target.all_words():
            learner.set_word(w, make_my_hypothesis())

        j = 0
        for h in MHSampler(learner, data, SAMPLES, skip=0):
            hypotheses.add(h)
            j += 1
            if j > 0 and j % 1000 == 0:
                pickle.dump(hypotheses,
                            open('data/hypset_' + GRAMMAR_TYPE + '_' + str(data_size) + '_' + str(j) + '.pickle', 'w'))
            # sstr = str(h)
            # sstr = re.sub("[_ ]", "", sstr)
            # sstr = re.sub("presup", u"\u03BB A B . presup", sstr)
            # print sstr

        all_hypotheses.update(hypotheses)

    return all_hypotheses
def some_stuff():
    # simulate some data with a high probability of center embedding
    # and a lower probability of tail recursion
    lst = ["(", "[", "]", ")"]
    randomTstLists = [("(", ")", "[", "]"), ("[", "]", "(", ")")]
    d = simulateData(lst, randomTstLists, p=1.0, N=12)
    for k in d.keys():
        if d[k] > 0.0:
            print k, d[k]
    print len(d)

    # print isValidCenterEmbed(("(", "]"))
    # print isValidCenterEmbed(('(', '(', ')', ']'))

    data = [FunctionData(input=(), output=d, alpha=0.9)]

    h0 = MyHypothesis()
    from numpy import exp
    for h in MHSampler(h0, data, steps=5000):
        y = h()
        # print h, y
        if isValidCenterEmbed(y):
            print exp(h.compute_posterior(data)), h, y
        # else:
        #     print h.compute_likelihood(data), h, y
def run(options, ndata):
    if LOTlib.SIG_INTERRUPTED:
        return 0, set()

    language = eval(options.LANG + "()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE

    z = sum(data[0].output.values())
    if z > 0:
        best_ll = sum([(p / z) * log(p / z) for p in data[0].output.values()])
    else:
        best_ll = 0.0

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', "'%s'" % t, None, 1.0)

    # set up the hypothesis
    h0 = IncrementalLexiconHypothesis(grammar=grammar, alphabet_size=len(language.terminals()))
    h0.set_word(0, h0.make_hypothesis(grammar=grammar))  # make the first word at random
    h0.N = 1

    tn = TopN(N=options.TOP_COUNT)

    for outer in xrange(options.N):  # how many do we add?
        if LOTlib.SIG_INTERRUPTED:
            return 0, set()

        # and re-set the posterior or else it's something weird
        h0.compute_posterior(data)

        # now run mcmc
        for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
            h.best_ll = best_ll  # just store this
            tn.add(copy(h))

            if options.TRACE:
                print h.posterior_score, h.prior, h.likelihood, h.likelihood / ndata, h
                v = h()
                sortedv = sorted(v.items(), key=operator.itemgetter(1), reverse=True)
                print "{" + ', '.join(["'%s':%s" % i for i in sortedv]) + "}"

        # and start from where we ended
        h0 = copy(h)
        h0.deepen()

    return ndata, tn
def get_top_N(pair1, pair2):
    priors = {}
    complete = 0
    for p1 in pair1:
        for p2 in pair2:
            data = [FunctionData(alpha=alpha, input=[p1], output={p2: len(p2)})]
            h0 = MyHypothesis()
            top_hyps = set()
            seen = set()
            chains = 0
            while (len(seen) < n_top and chains < max_chains) or (len(seen) < 3):
                chains += 1
                x = 0
                for h in MHSampler(h0, data, steps=steps, acceptance_temperature=acc_temp):
                    # print y
                    out = h(p1)[:len(p2)]
                    str_h = str(h)
                    if len(out) == len(p2) and hamming_distance(out, p2) == 0:
                        if str_h not in seen:  # and "from" not in str_h[14:]:
                            top_hyps.add((copy.deepcopy(h), h.prior))
                            seen.add(str_h)
                    if x % 1000 == 0:
                        print_star(x, h, out, p2, h.value.get_rule_signature(), len(seen))
                    x += 1
            print_star()

            priors[(p1, p2)] = []
            for h in sorted(top_hyps, key=lambda tup: -tup[1])[:n_top]:
                print p1, p2
                print h[0], h[1], h[0].value.count_subnodes()
                priors[(p1, p2)].append((copy.deepcopy(h[0]), h[1], h[0].value.count_subnodes()))

            complete += 1
            print "complete: %d" % complete

    for key in priors:
        print "***"
        print key
        for p in priors[key]:
            print p
        print "***"

    return priors
def run():
    h0 = make_hypothesis()
    data = make_data()
    for x in break_ctrlc(MHSampler(h0, data, STEPS)):
        print x.posterior_score, x
        for di in data:
            print "\t", di.input, "->", x(*di.input), " ; should be ", di.output
def run_one(iteration, model, model2data, sampler_type):
    """
    Run one iteration of a sampling method
    """
    if LOTlib.SIG_INTERRUPTED:
        return

    # Take the model and load the function to create hypotheses
    # Data is passed in to be constant across runs
    if re.search(r":", model):
        m, d = re.split(r":", model)
        make_hypothesis, _ = load_example(m)
    else:
        make_hypothesis, _ = load_example(model)

    h0 = make_hypothesis()
    grammar = h0.grammar
    data = model2data[model]

    # Create a sampler
    if sampler_type == 'mh_sample_A':
        sampler = MHSampler(h0, data, options.SAMPLES, likelihood_temperature=1.0)
    elif sampler_type == 'mh_sample_B':
        sampler = MHSampler(h0, data, options.SAMPLES, likelihood_temperature=1.1)
    elif sampler_type == 'mh_sample_C':
        sampler = MHSampler(h0, data, options.SAMPLES, likelihood_temperature=1.25)
    elif sampler_type == 'mh_sample_D':
        sampler = MHSampler(h0, data, options.SAMPLES, likelihood_temperature=2.0)
    elif sampler_type == 'mh_sample_E':
        sampler = MHSampler(h0, data, options.SAMPLES, likelihood_temperature=5.0)
    elif sampler_type == 'particle_swarm_A':
        sampler = ParticleSwarm(make_hypothesis, data, steps=options.SAMPLES, within_steps=10)
    elif sampler_type == 'particle_swarm_B':
        sampler = ParticleSwarm(make_hypothesis, data, steps=options.SAMPLES, within_steps=100)
    elif sampler_type == 'particle_swarm_C':
        sampler = ParticleSwarm(make_hypothesis, data, steps=options.SAMPLES, within_steps=200)
    elif sampler_type == 'particle_swarm_prior_sample_A':
        sampler = ParticleSwarmPriorResample(make_hypothesis, data, steps=options.SAMPLES, within_steps=10)
    elif sampler_type == 'particle_swarm_prior_sample_B':
        sampler = ParticleSwarmPriorResample(make_hypothesis, data, steps=options.SAMPLES, within_steps=100)
    elif sampler_type == 'particle_swarm_prior_sample_C':
        sampler = ParticleSwarmPriorResample(make_hypothesis, data, steps=options.SAMPLES, within_steps=200)
    elif sampler_type == 'multiple_chains_A':
        sampler = MultipleChainMCMC(make_hypothesis, data, steps=options.SAMPLES, nchains=10)
    elif sampler_type == 'multiple_chains_B':
        sampler = MultipleChainMCMC(make_hypothesis, data, steps=options.SAMPLES, nchains=100)
    elif sampler_type == 'multiple_chains_C':
        sampler = MultipleChainMCMC(make_hypothesis, data, steps=options.SAMPLES, nchains=1000)
    elif sampler_type == 'parallel_tempering_A':
        sampler = ParallelTemperingSampler(make_hypothesis, data, steps=options.SAMPLES, within_steps=10,
                                           temperatures=[1.0, 1.025, 1.05], swaps=1, yield_only_t0=False)
    elif sampler_type == 'parallel_tempering_B':
        sampler = ParallelTemperingSampler(make_hypothesis, data, steps=options.SAMPLES, within_steps=10,
                                           temperatures=[1.0, 1.25, 1.5], swaps=1, yield_only_t0=False)
    elif sampler_type == 'parallel_tempering_C':
        sampler = ParallelTemperingSampler(make_hypothesis, data, steps=options.SAMPLES, within_steps=10,
                                           temperatures=[1.0, 2.0, 5.0], swaps=1, yield_only_t0=False)
    elif sampler_type == 'taboo_A':
        sampler = TabooMCMC(h0, data, steps=options.SAMPLES, skip=0, penalty=0.001)
    elif sampler_type == 'taboo_B':
        sampler = TabooMCMC(h0, data, steps=options.SAMPLES, skip=0, penalty=0.010)
    elif sampler_type == 'taboo_C':
        sampler = TabooMCMC(h0, data, steps=options.SAMPLES, skip=0, penalty=0.100)
    elif sampler_type == 'taboo_D':
        sampler = TabooMCMC(h0, data, steps=options.SAMPLES, skip=0, penalty=1.000)
    elif sampler_type == 'taboo_E':
        sampler = TabooMCMC(h0, data, steps=options.SAMPLES, skip=0, penalty=10.000)
    elif sampler_type == 'partitionMCMC_A':
        sampler = PartitionMCMC(grammar, make_hypothesis, data, 10, steps=options.SAMPLES)
    elif sampler_type == 'partitionMCMC_B':
        sampler = PartitionMCMC(grammar, make_hypothesis, data, 100, steps=options.SAMPLES)
    elif sampler_type == 'partitionMCMC_C':
        sampler = PartitionMCMC(grammar, make_hypothesis, data, 1000, steps=options.SAMPLES)
    elif sampler_type == 'enumeration_A':
        sampler = EnumerationInference(grammar, make_hypothesis, data, steps=options.SAMPLES)
    else:
        assert False, "Bad sampler type: %s" % sampler_type

    # And open our output and evaluate
    with open("output/out-aggregate.%s" % get_rank(), 'a') as out_aggregate:
        evaluate_sampler(sampler, trace=False,
                         prefix="\t".join(map(str, [model, iteration, sampler_type])),
                         out_aggregate=out_aggregate, print_every=options.PRINTEVERY)
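# The if/elif ladder above maps each sampler_type string to one constructor
# call. A hedged alternative sketch (not how run_one is written in LOTlib): a
# dispatch table of thunks, which keeps each configuration on one line and
# makes the set of valid sampler types trivially enumerable. The entries shown
# just mirror two branches above; the rest would follow the same pattern.
#
# SAMPLER_TABLE = {
#     'mh_sample_A': lambda: MHSampler(h0, data, options.SAMPLES, likelihood_temperature=1.0),
#     'taboo_A':     lambda: TabooMCMC(h0, data, steps=options.SAMPLES, skip=0, penalty=0.001),
#     # ... one entry per sampler_type ...
# }
# try:
#     sampler = SAMPLER_TABLE[sampler_type]()
# except KeyError:
#     assert False, "Bad sampler type: %s" % sampler_type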
def runme(chain, dataamt):
    if LOTlib.SIG_INTERRUPTED:
        return ()

    data = make_data(dataamt)
    tn = TopN(options.top)
    h0 = make_hypothesis()
    for h in break_ctrlc(MHSampler(h0, data, steps=options.steps, skip=0)):
        # print h.posterior_score, h.prior, h.likelihood, h
        h.likelihood_per_data = h.likelihood / dataamt
        tn.add(h)
    return tn
def run(data, TOP=100, STEPS=1000):
    # if LOTlib.SIG_INTERRUPTED:
    #     return ""
    # data = [FunctionData(input=(), output={lst: len(lst)})]

    h0 = MyHypothesis()
    tn = TopN(N=TOP)

    # run the sampler
    counter = Counter()
    for h in MHSampler(h0, data, steps=STEPS, acceptance_temperature=1.0, likelihood_temperature=1.0):  # , likelihood_temperature=10.0):
        # counter[h] += 1
        tn.add(h)

    z = logsumexp([h.posterior_score for h in tn])
    sort_post_probs = [(h, exp(h.posterior_score - z)) for h in tn.get_all(sorted=True)][::-1]
    return sort_post_probs
def myrun(observed_set):
    if LOTlib.SIG_INTERRUPTED:
        return set()

    h0 = NumberGameHypothesis(grammar=grammar)
    data = [FunctionData(input=[], output=observed_set, alpha=ALPHA)]

    tn = TopN(N=options.TOP_COUNT)
    for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
        tn.add(h)

    print "# Finished %s" % str(observed_set)
    return set(tn.get_all())
def standard_sample(make_hypothesis, make_data, show_skip=9, show=True, N=100, save_top='top.pkl', alsoprint='None', **kwargs):
    """
    A simplified interface for sampling that supports printing (showing), returning the top N,
    and saving. This is used by many examples and is meant to make it easy to run with a
    variety of parameters.

    NOTE: `show_skip` is a skip *only* on printing.
    **kwargs get passed to the sampler.
    """
    if LOTlib.SIG_INTERRUPTED:
        return TopN()  # So we don't waste time!

    h0 = make_hypothesis()
    data = make_data()

    best_hypotheses = TopN(N=N)

    f = eval(alsoprint)

    sampler = MHSampler(h0, data, **kwargs)

    # # TODO: change acceptance temperature over time
    # sampler.acceptance_temperature = 0.5

    for i, h in enumerate(break_ctrlc(sampler)):
        # if i % 10000 == 0 and i != 0:
        #     sampler.acceptance_temperature = min(1.0, sampler.acceptance_temperature + 0.1)
        #     print '='*50
        #     print 'change acc temperature to', sampler.acceptance_temperature

        best_hypotheses.add(h)

        if show and i % (show_skip + 1) == 0:
            print i, \
                h.posterior_score, \
                h.prior, \
                h.likelihood, \
                f(h) if f is not None else '', \
                qq(cleanFunctionNodeString(h))

    if save_top is not None:
        print "# Saving top hypotheses"
        with open(save_top, 'w') as f:
            pickle.dump(best_hypotheses, f)

    return best_hypotheses
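# A hedged usage sketch for standard_sample. The example module path below is
# an assumption based on LOTlib's Examples layout and may differ across
# versions; extra keyword arguments such as steps= are forwarded to MHSampler.
#
#   from LOTlib.Examples.Number.Model import make_hypothesis, make_data  # assumed path
#
#   top = standard_sample(make_hypothesis, make_data, steps=10000, N=100, show_skip=99)
#   for h in top.get_all(sorted=True):
#       print h.posterior_score, h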
def run(save_file, alpha, iters, propose_scale, propose_n, skip, summary_cap):
    # Faux data
    data = [
        HumanData(
            data=FunctionData(input=[2, 4, 6, 8], output=[]),
            queries=(1, 20, 30, 48, 80, 99),
            responses=((1, 19), (17, 3), (15, 5), (19, 1), (20, 0), (2, 18))
        ),
        HumanData(
            data=FunctionData(input=[10, 40], output=[]),
            queries=(1, 20, 30, 48, 80, 99),
            responses=((1, 19), (20, 0), (20, 0), (2, 18), (19, 1), (2, 18))
        )
    ]

    # Enumerate all 'domain level' hypotheses generated by our grammar
    hypotheses = []
    for fn in simple_grammar.enumerate(d=10):
        h = NumberGameHypothesis(grammar=simple_grammar, domain=100, alpha=alpha)
        h.set_value(fn)
        h.compute_prior()
        hypotheses.append(h)

    grammar_h0 = GrammarHypothesisVectorized(simple_grammar, hypotheses,
                                             propose_scale=propose_scale, propose_n=propose_n)
    mh_grammar_sampler = MHSampler(grammar_h0, data, iters)
    mh_grammar_summary = VectorSummary(skip=skip, cap=summary_cap)

    print '^*'*60, '\nGenerating GrammarHypothesis Samples\n', '^*'*60

    # Initialize csv file
    mh_grammar_summary.csv_initfiles(save_file)

    # Sample GrammarHypotheses!
    for i, gh in enumerate(mh_grammar_summary(mh_grammar_sampler)):
        if (i % 10 == 0):
            print i, " ITERATIONS"
            print '\n', '#'*100

        # Save to CSV & print grammar rule values
        if (i % skip == 0):
            mh_grammar_summary.csv_appendfiles(save_file, data)
            for idx in grammar_h0.get_propose_idxs():
                print idx, '\t| ', grammar_h0.rules[idx]

    mh_grammar_summary.pickle_summary(filename=save_file + '_summary.p')
def get_top_N(pair1, pair2):
    priors = {}
    for p1 in pair1:
        for p2 in pair2:
            data = [FunctionData(alpha=alpha, input=[p1], output={p2: len(p2)})]
            h0 = MyHypothesis()
            top_hyps = set()
            seen = set()
            x = 0
            while len(top_hyps) < n_top * 2:
                for h in MHSampler(h0, data, steps=steps):
                    # print y
                    out = h(p1)[:len(p2)]
                    str_h = str(h)
                    if len(out) == len(p2) and hamming_distance(out, p2) == 0:
                        if str_h not in seen:
                            top_hyps.add((copy.deepcopy(h), h.prior))
                            seen.add(str_h)
                    if x % 1000 == 0:
                        print p1, p2
                        print_star(x, h, out, p2, h.value.get_rule_signature())
                    x += 1
            print_star()

            priors[(p1, p2)] = []
            for h in sorted(top_hyps, key=lambda tup: -tup[1])[:n_top]:
                print p1, p2
                print h[0], h[1], h[0].value.count_subnodes()
                priors[(p1, p2)].append((copy.deepcopy(h[0]), h[1], h[0].value.count_subnodes()))

    for key in priors:
        print "***"
        print key
        for p in priors[key]:
            print p
        print "***"

    return priors
def run(options, ndata):
    """
    Run on the DATA_RANGE amounts of data and return all hypotheses in the top count.
    """
    if LOTlib.SIG_INTERRUPTED:
        return 0, set()

    language = eval(options.LANG+"()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE
    # print data

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', q(t), None, 2)

    h0 = IncrementalLexiconHypothesis(grammar=grammar)

    tn = TopN(N=options.TOP_COUNT)

    for outer in xrange(options.N):  # how many do we add?
        # add to the grammar
        grammar.add_rule('SELFF', '%s' % (outer), None, 1.0)

        # Add one more to the number of words here
        h0.set_word(outer, h0.make_hypothesis(grammar=grammar))
        h0.N = outer + 1
        assert len(h0.value.keys()) == h0.N == outer + 1

        # now run mcmc
        for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
            tn.add(h)

            print h.posterior_score, h
            print getattr(h, 'll_counts', None)

        # and start from where we ended
        h0 = deepcopy(h)  # must deepcopy

    return ndata, tn
def run(options, ndata):
    """
    Run on the DATA_RANGE amounts of data and return all hypotheses in the top count.
    """
    if LOTlib.SIG_INTERRUPTED:
        return set()

    language = eval(options.LANG+"()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE
    # print data

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', q(t), None, 2)

    h0 = AugustHypothesis(grammar=grammar, display="lambda recurse_ :%s")

    print "# Starting on ", h0

    tn = TopN(N=options.TOP_COUNT)

    # print h0.compute_posterior(data)
    # for i, h in enumerate(break_ctrlc(MHSampler(h0, data, steps=options.STEPS))):
    # # for h in MHSampler(h0, data, steps=options.STEPS, trace=True):
    #     print h.posterior_score, h
    #     print getattr(h, 'll_counts', None)

    with open(prefix+'hypotheses_'+options.LANG+'_'+str(rank)+'_'+str(ndata)+'_'+suffix+".txt", 'a') as ofile:
        for i, h in enumerate(break_ctrlc(MHSampler(h0, data, steps=options.STEPS))):
            tn.add(h)
            # print h.posterior_score, getattr(h, 'll_counts', None), h
            if i % options.SKIP == 0 and h.posterior_score > -Infinity:
                print >>ofile, i, ndata, h.posterior_score, h.prior, h.likelihood, h.likelihood/ndata
                print >>ofile, getattr(h, 'll_counts', None)
                print >>ofile, h, '\0'  # must add \0 when not a Lexicon

    return tn
def run(options, ndata):
    """
    Run on the DATA_RANGE amounts of data and return all hypotheses in the top count.
    """
    if LOTlib.SIG_INTERRUPTED:
        return set()

    language = eval(options.LANG + "()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE
    print data

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', q(t), None, 2)

    h0 = AugustHypothesis(grammar=grammar, display="lambda recurse_ :%s")

    tn = TopN(N=options.TOP_COUNT)

    for i, h in enumerate(break_ctrlc(MHSampler(h0, data, steps=options.STEPS))):
        print h.posterior_score, h
        print getattr(h, 'll_counts', None)

    # with open(prefix+'hypotheses_'+options.LANG+'_'+str(rank)+'_'+str(ndata)+'_'+suffix+".txt", 'a') as ofile:
    #     for i, h in enumerate(break_ctrlc(MHSampler(h0, data, steps=options.STEPS))):
    #         tn.add(h)
    #         # print h.posterior_score, getattr(h, 'll_counts', None), h
    #         if i % options.SKIP == 0:
    #             print >>ofile, "\n"
    #             print >>ofile, i, ndata, h.posterior_score, h.prior, h.likelihood, h.likelihood/ndata
    #             print >>ofile, getattr(h, 'll_counts', None),
    #             print >>ofile, h  # ends in \0 so we can sort with sort -g -z

    return tn
def runTest(self):
    for model in ['EvenOdd', 'FOL', 'Magnetism.Simple', 'Magnetism.Complex', 'NAND', 'Number',
                  'RegularExpression', 'RationalRules', 'StochasticGrammarInduction',
                  'SymbolicRegression.Galileo', 'SymbolicRegression.Symbolic',
                  'Prolog', 'PureLambda', 'Lua']:
        print "# Testing loading of example", model

        make_hypothesis, make_data = load_example(model)

        d = make_data()
        d = make_data(10)  # require an amount

        # Let's just try initializing a bunch of times
        for _ in xrange(100):
            h0 = make_hypothesis()

        # and ensure that the sampling will run
        for _ in MHSampler(h0, d, steps=100):
            pass
def run(data_size, my_finite_trees):
    data = generate_data(data_size)

    # the prior for each tree
    prior = np.array([x.compute_prior() for x in my_finite_trees])
    prior = prior - logsumexp(prior)

    # the likelihood weights for each hypothesis
    weights = np.array([my_weight_function(h) for h in my_finite_trees])

    # response[h, di] gives the response of the h'th tree to data di
    response = np.array([mapto012(get_tree_set_responses(t, data)) for t in my_finite_trees])

    # Now actually run:
    hypset = TopN(N=TOP_COUNT)

    learner = VectorizedLexicon_DistanceMetricProposal(target.all_words(), my_finite_trees, prior)
    databundle = [response, weights]
    generator = MHSampler(learner, databundle, STEPS, skip=SKIP)
    for g in generator:
        hypset.add(VectorizedLexicon_to_SimpleLexicon(g), g.posterior_score)

    return hypset
h0.start_counts = start_counts
# for h in grammar.get_rule
# print h0.__dict__.get('rrAlpha', 1.0)
h0.set_value(value=h)
h0.compute_prior()
h0.compute_likelihood(data)
print s, h0.value, exp(h0.prior)  # , h0.likelihood
s += 1

# unit_tests()
assert (False)

stp = 0
t1 = time.time()
best = None
best_posterior = None
for h in SampleStream(MHSampler(h0, data, steps=stps)):
    r = h()
    if best_posterior is None or h.posterior_score >= best_posterior:
        best = copy.deepcopy(h)
        best_out = r
        best_posterior = h.posterior_score
    if stp % 500 == 0:
        print stp, float(stp + 1) / (time.time() - t1)
        try:
            print hamming_distance(lst, r[:len(lst)])
        except:
            print len(lst)
    stp += 1  # advance the step counter so the progress print fires every 500 samples

print best
print best_out
# print "#\t Loaded human data for concept %s" % concept

print "# Created L, NYes, NTrials, and HOutput of size %s" % len(L)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Run inference
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

from LOTlib import break_ctrlc
from LOTlib.Inference.GrammarInference.FullGrammarHypothesis import FullGrammarHypothesis
from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler

h0 = FullGrammarHypothesis(counts, L, GroupLength, prior_offset, NYes, NTrials, Output)

mhs = MHSampler(h0, [], 100000, skip=0)
for s, h in break_ctrlc(enumerate(mhs)):
    print mhs.acceptance_ratio(), h.prior, h.likelihood,\
        h.value['alpha'].value[0], h.value['beta'].value[0],\
        h.value['prior_temperature'].value, h.value['likelihood_temperature'].value,\
        'RULES',\
        ' '.join([str(x) for x in h.value['rulep']['BOOL'].value]),\
        ' '.join([str(x) for x in h.value['rulep']['PREDICATE'].value]),\
        ' '.join([str(x) for x in h.value['rulep']['START'].value]),\
        ' '.join([str(x) for x in h.value['rulep']['SET'].value])

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Run gradient ascent
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def make_hypothesis(**kwargs):
    return MyHypothesis(grammar=grammar, rrAlpha=1.0, **kwargs)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Main
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == "__main__":
    from LOTlib import break_ctrlc
    from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler
    from LOTlib.Miscellaneous import q

    # Create an initial hypothesis
    # This is where we set a number of relevant variables -- whether to use RR, alpha, etc.
    h0 = MyHypothesis(grammar, ll_decay=1.0, rrAlpha=1.0, args=['x'])

    data = make_data()

    # Run the vanilla sampler. Without steps, it will run infinitely.
    # This prints out the posterior score, prior, and likelihood.
    for h in break_ctrlc(MHSampler(h0, data, 10000, skip=100, shortcut_likelihood=False)):
        print h.posterior_score, h.prior, h.likelihood, q(h)

    # This setup requires the *later* data to be upweighted, meaning that hypotheses that get
    # later data wrong should be given lower likelihood. But also with the decay, the overall
    # magnitude of the likelihood decreases.
#                                        output=stim)]
h0 = MyHypothesis()
MAP = None
best_post = -float("inf")
best_out = ""
n_comp = n_compatible(stim, concepts)
print stim, n_comp

while n_compatible(stim, concepts) < 2:
    for h in MHSampler(h0, data, steps=STEPS, acceptance_temperature=2.):
        out = h(all_C)
        str_h = str(h)
        if out not in concepts:
            concepts[out] = []
        if str_h not in seen:
            if len(concepts[out]) < 200 or np.exp(h.prior) > min([x[0] for x in concepts[out]]):
                # if len(concepts[out]) > 0:
                #     print len(concepts[out]), np.exp(h.prior), min([x[0] for x in concepts[out]])
                concepts[out].append(
def run(pairs):
    priors = {}
    complete = 0
    top_hyps = set()
    already_done = set()
    t_start = time.time()

    for pair in pairs:
        p1 = pair[0]
        p2 = pair[1]
        h0 = MyHypothesis()
        t_pair = time.time()
        # h0.start_counts = add_counts
        seen = set()
        # for ind in xrange(2, 3):
        for ind in xrange(len(p1) + 1):
            seen_round = set()
            x = 0
            p1_i = p1[:ind]
            p2_i = p2[:ind]
            if (p1, p2_i) not in already_done:
                already_done.add((p1, p2_i))
                data = [FunctionData(alpha=alpha, input=[p1], output={p2_i: len(p2_i)})]
                while len(seen_round) < n_top:
                    for h in MHSampler(h0, data, steps=steps,
                                       acceptance_temperature=acc_temp,
                                       prior_temperature=prior_temp):
                        if len(seen_round) >= n_top:
                            break
                        str_h = str(h.value)
                        out = h(p1)[:len(p2_i)]
                        if (len(out) == len(p2_i) and (hamming_distance(out, p2_i) == 0)
                                and (len(h(p1)[:len(p1)]) == len(p1))):
                            if str_h not in seen:  # and "from" not in str_h[14:]:
                                l_rules = [str(i) for i in list(numpy.hstack(get_rule_counts(grammar, h.value)))]
                                top_hyps.add((toAB(p1), ind, copy.deepcopy(h), toAB(p2),
                                              toAB(h(p1_i)[:len(p1)]), ",".join(l_rules), str(h.value)))
                                seen.add(str_h)
                            if str_h not in seen_round:
                                seen_round.add(str_h)
                        if x % 1000 == 0:
                            print_star("seen:%d" % len(seen_round), "steps:%d" % x,
                                       "hyp:%s" % str_h, "p2:%s" % p2_i, "out:%s" % out,
                                       "prior:%f" % h.prior,
                                       "pair_time:%.2f" % (time.time() - t_pair),
                                       "tot_time:%.2f" % (time.time() - t_start))
                        x += 1

    for h in top_hyps:
        print_star(h[0], h[1], h[2], h[3], h[4], h[5])

    return top_hyps
# Hypothesis
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

from LOTlib.Hypotheses.RationalRulesLOTHypothesis import RationalRulesLOTHypothesis

def make_hypothesis(grammar=grammar, **kwargs):
    return RationalRulesLOTHypothesis(grammar=grammar, rrAlpha=1.0, **kwargs)

if __name__ == "__main__":
    from LOTlib.TopN import TopN
    hyps = TopN(N=1000)

    from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler
    from LOTlib import break_ctrlc

    mhs = MHSampler(make_hypothesis(), make_data(), 1000000,
                    likelihood_temperature=1., prior_temperature=1.)

    for samples_yielded, h in break_ctrlc(enumerate(mhs)):
        h.ll_decay = 0.
        hyps.add(h)

    import pickle
    with open('HypothesisSpace.pkl', 'w') as f:
        pickle.dump(hyps, f)
# Stash counts for viz
with open('Viz/Counts_' + MODEL + '.csv', 'w') as f:
    f.writelines('\n'.join([','.join([str(r) for r in h0]) + ',' + ','.join([str(r) for r in h])
                            for h0, h in zip(counts['BOOL'], counts['PREDICATE'])]))

print "# Computed counts for each hypothesis & nonterminal"

from LOTlib.Inference.GrammarInference.SimpleGrammarHypothesis import SimpleGrammarHypothesis
from LOTlib.Inference.GrammarInference.FullGrammarHypothesis import FullGrammarHypothesis
from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler

h0 = SimpleGrammarHypothesis(counts, L, GroupLength, prior_offset, NYes, NTrials, Output)
# h0 = FullGrammarHypothesis(counts, L, GroupLength, prior_offset, NYes, NTrials, Output)

writ = []
mhs = MHSampler(h0, [], 100, skip=500)
for s, h in break_ctrlc(enumerate(mhs)):
    if isinstance(h, SimpleGrammarHypothesis):
        a = str(mhs.acceptance_ratio()) + ',' + str(h.prior) + ',' + str(h.likelihood) + ',BOOLS,' +\
            ','.join([str(x) for x in h.value['BOOL'].value]) + ',PREDS,' +\
            ','.join([str(x) for x in h.value['PREDICATE'].value])
    else:
        assert isinstance(h, FullGrammarHypothesis)
        a = str(mhs.acceptance_ratio()) + ',' + str(h.prior) + ',' + str(h.likelihood) + ',' +\
            str(h.value['alpha'].value[0]) + ',' + str(h.value['beta'].value[0]) + ',' +\
            str(h.value['prior_temperature']) + ',' + str(h.value['likelihood_temperature']) + ',RULES,' +\
            ','.join([str(x) for x in h.value['rulep']['PREDICATE'].value])

    print a
    writ.append(a)
def next(self):
    # Just set the temperatures by the schedules
    self.prior_temperature = self.prior_schedule.next()
    self.likelihood_temperature = self.likelihood_schedule.next()
    return MHSampler.next(self)
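# A hedged usage sketch for the schedule-based sampler defined by the __init__
# and next methods above. The enclosing class name ('ScheduledMHSampler') and
# the LinearSchedule helper are assumptions for illustration; only the schedule
# protocol (an object whose next() returns a temperature) is taken from the code.
class LinearSchedule(object):
    """Anneal a temperature linearly from start to end over a fixed number of steps."""
    def __init__(self, start, end, steps):
        self.start, self.end, self.steps = start, end, steps
        self.t = 0
    def next(self):
        frac = min(1.0, float(self.t) / self.steps)
        self.t += 1
        return self.start + frac * (self.end - self.start)

# e.g., cool the likelihood from temperature 5.0 down to 1.0 over the first 1000 steps:
# sampler = ScheduledMHSampler(h0, data, steps=10000,
#                              likelihood_schedule=LinearSchedule(5.0, 1.0, 1000))
# for h in sampler:
#     print h.posterior_score, h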
import pickle

from LOTlib import break_ctrlc
from LOTlib.TopN import TopN
from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler
from Model import *
from TargetConcepts import TargetConcepts

NDATA = 20       # How many data points for each function?
NSTEPS = 100000
BEST_N = 500     # How many from each hypothesis to store

# Where we keep track of all hypotheses (across concepts)
all_hypotheses = TopN(N=BEST_N)

if __name__ == "__main__":
    # Now loop over each target concept and get a set of hypotheses
    for i, f in enumerate(TargetConcepts):
        # Set up the hypothesis
        h0 = make_hypothesis()

        # Set up some data
        data = make_data(NDATA, f)

        # Now run some MCMC
        fs = TopN(N=BEST_N, key="posterior_score")
        fs.add(break_ctrlc(MHSampler(h0, data, steps=NSTEPS, trace=False)))

        all_hypotheses.update(fs)

    pickle.dump(all_hypotheses, open("hypotheses.pkl", 'w'))
                log(before_same_children) - log(nrk)) + old_lp_below

        return [newt, f - b]


if __name__ == "__main__":
    from LOTlib import break_ctrlc
    # from LOTlib.Examples.Number.Shared import grammar, make_h0, generate_data
    # data = generate_data(300)

    ## NOTE: TO NORMALLY USE THIS, YOU MUST MIX WITH REGENERATION PROPOSAL -- ELSE NOT ERGODIC
    from LOTlib.Examples.Magnetism.Simple.Run import grammar, make_h0, data
    from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler

    idp = InsertDeleteProposal(grammar)

    # data = generate_data(100)
    h = make_h0(proposal_function=idp)
    for h in break_ctrlc(MHSampler(h, data, 100000)):
        print h.posterior_score, h

    """
    for _ in xrange(100):
        t = grammar.generate()
        print "\n\n", t
        for _ in xrange(10):
            print "\t", idp.propose_tree(t)
    """
grammar.start = 'TWO_CONCEPT_START'

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Hypothesis
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

from Model import make_hypothesis

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Main
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == "__main__":
    from LOTlib import break_ctrlc
    from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler
    from LOTlib.Miscellaneous import q

    # Create an initial hypothesis
    # This is where we set a number of relevant variables -- whether to use RR, alpha, etc.
    # Here we give args as "concept" (used in TWO_CONCEPT_START above) and "x"
    h0 = make_hypothesis(grammar=grammar, args=['concept', 'x'])

    data = make_data()

    # Run the vanilla sampler. Without steps, it will run infinitely.
    # This prints out the posterior score, prior, and likelihood.
    for h in break_ctrlc(MHSampler(h0, data, 10000, skip=100)):
        print h.posterior_score, h.prior, h.likelihood, q(h)
def run(grammar=lot_grammar, mixture_model=0, data=toy_exp_3,
        iters=10000, skip=10, cap=100, print_stuff='sgr',
        ngh='out/ngh_100k', hypotheses=None, domain=100, alpha=0.9,
        save_file='', csv_freq=500, pickle_summary=False, pickle_gh=0):
    """
    Enumerate some NumberGameHypotheses, then use these to sample some GrammarHypotheses over `data`.

    Arguments
    ---------
    grammar : LOTlib.Grammar
        This is our grammar.
    mixture_model : bool
        Are we using the MixtureGrammarHypothesis?
    data : list
        List of FunctionData to use as input/output data.
    ngh : str
        Where is the file we save/load our ngh's to/from?
    iters : int
        Number of GrammarHypotheses to sample.
    skip : int
        Collect 1 gh sample every `skip` samples.
    cap : int
        VectorSummary will collect this many GrammarHypothesis samples.
    print_stuff : str
        What do we print?  ['s' | 'g' | 'r']
    save_file : str
        If we're pickling or saving csvs, this is the file name to save to.
    """
    # --------------------------------------------------------------------------------------------------------
    if mixture_model:
        ParameterHypothesis = MixtureGrammarHypothesis
    else:
        ParameterHypothesis = NoConstGrammarHypothesis

    # --------------------------------------------------------------------------------------------------------
    # Load NumberGameHypotheses
    if hypotheses is None:
        # In case we want to enumerate hypotheses instead of loading from file
        if 'enum' in ngh:
            hypotheses = []
            for fn in grammar.enumerate(d=int(re.sub('[a-z]', '', ngh))):
                h = NumberGameHypothesis(grammar=grammar, domain=domain, alpha=alpha)
                h.set_value(fn)
                h.compute_prior()
                hypotheses.append(h)
            ngh += '.p'
        # Load NumberGameHypotheses
        else:
            f = open(ngh, "rb")
            hypotheses = pickle.load(f)
            for h in hypotheses:
                h.grammar = grammar

    # --------------------------------------------------------------------------------------------------------
    # Fill VectorSummary
    grammar_h0 = ParameterHypothesis(grammar, hypotheses, ngh_file=ngh, propose_scale=.1, propose_n=1)
    mh_grammar_sampler = MHSampler(grammar_h0, data, iters)
    mh_grammar_summary = VectorSummary(skip=skip, cap=cap)

    # Print all GrammarRules in grammar with corresponding value index
    if 'r' in print_stuff:
        print '='*100, '\nGrammarRules:'
        for idx in grammar_h0.get_propose_idxs():
            print idx, '\t| ', grammar_h0.rules[idx]

    if 's' in print_stuff:
        print '^*'*60, '\nGenerating GrammarHypothesis Samples\n', '^*'*60

    # Initialize csv file
    if save_file:
        mh_grammar_summary.csv_initfiles(save_file)

    # Sample GrammarHypotheses!
    for i, gh in enumerate(mh_grammar_summary(mh_grammar_sampler)):
        if save_file and csv_freq and (i % csv_freq == 0):
            mh_grammar_summary.csv_appendfiles(save_file, data)

        # Save every N samples, where N=pickle_gh
        if pickle_gh and (i % pickle_gh == 0):
            mh_grammar_summary.pickle_MAPsample(save_file+'_map_'+str(i/pickle_gh)+'.p')
            mh_grammar_summary.pickle_cursample(save_file+'_cur_'+str(i/pickle_gh)+'.p')

        # Print every iters/20 samples
        if 's' in print_stuff:
            if i % (iters/20) == 0:
                for idx in gh.get_propose_idxs():
                    print idx, '\t| ', gh.rules[idx], ' --> ', gh.value[idx]
                # print i, '-'*100, '\n', {idx: gh.value[idx] for idx in gh.get_propose_idxs()}
                print gh.prior, gh.likelihood, gh.posterior_score

    # Save summary & print top samples
    if pickle_summary:
        mh_grammar_summary.pickle_summary(filename=save_file+'_summary.p')
    if 'g' in print_stuff:
        mh_grammar_summary.print_top_samples()