def make_data(n=1, alpha=0.9, dataset=('A',)):
    # Tuple default avoids the shared-mutable-default pitfall; extend (not
    # append) keeps the returned data a flat list of FunctionData.
    data = []
    if 'A' in dataset:
        data.extend([FunctionData(input=[Obj(shape='rhombus', color='cinnabar', size='miniature')],
                                  output=False, alpha=alpha),
                     FunctionData(input=[Obj(shape='pentagon', color='viridian', size='colossal')],
                                  output=True, alpha=alpha)] * n)
    if 'B' in dataset:
        data.extend([FunctionData(input=[Obj(shape='rhombus', color='cinnabar', size='miniature')],
                                  output=False, alpha=alpha),
                     FunctionData(input=[Obj(shape='dodecahedron', color='cerulean', size='intermediate')],
                                  output=True, alpha=alpha)] * n)
    return data

def make_data(alpha=0.99, size=1):
    # Here just doubling: x :-> cons(x, x)
    return [FunctionData(input=[[]], output=[[], []], alpha=alpha),
            FunctionData(input=[[[]]], output=[[[]], [[]]], alpha=alpha)] * size

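# Hedged usage sketch for the doubling make_data above -- an illustration,
# not part of the original module. It assumes FunctionData (used throughout
# this file) is in scope, and checks that `size` simply replicates the two
# base examples.
data = make_data(alpha=0.99, size=3)
assert len(data) == 6                # 2 base items * size
assert data[0].input == [[]]         # x ...
assert data[0].output == [[], []]    # ... maps to cons(x, x)
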
def some_stuff():
    # Simulate some data with a high probability of center embedding
    # and a lower probability of tail recursion.
    lst = ["(", "[", "]", ")"]
    randomTstLists = [("(", ")", "[", "]"), ("[", "]", "(", ")")]

    d = simulateData(lst, randomTstLists, p=1.0, N=12)
    for k in d.keys():
        if d[k] > 0.0:
            print k, d[k]
    print len(d)

    data = [FunctionData(input=(), output=d, alpha=0.9)]
    h0 = MyHypothesis()

    from numpy import exp
    for h in MHSampler(h0, data, steps=5000):
        y = h()
        if isValidCenterEmbed(y):
            print exp(h.compute_posterior(data)), h, y

def get_kl_seq():
    """
    1. read posterior sequences
    2. compute KL-divergence between adjacent distributions
    3. plot

    run: serial
    """
    print 'loading..'
    fff()
    seq_set = [load(open('seq0_0825_234606')),
               load(open('seq1_0825_234606')),
               load(open('seq2_0825_234606'))]
    kl_seq_set = []

    print 'compute prior..'
    fff()
    # Add a posterior set computed without observing any data.
    from copy import deepcopy
    dict_0 = deepcopy(seq_set[0][0])
    for h in dict_0:
        dict_0[h] = h.compute_posterior([FunctionData(input=[], output=Counter())])

    print 'making plot..'
    fff()
    for seq in seq_set:
        kl_seq = []
        seq.insert(0, dict_0)

        # Compute KL between each pair of adjacent distributions.
        for i in xrange(len(seq) - 1):
            current_dict = seq[i]
            next_dict = seq[i + 1]
            kl = compute_kl(current_dict, next_dict)
            kl_seq.append(log(kl))
            print 'KL from %i to %i: ' % (i, i + 1), kl
            fff()

        kl_seq_set.append(kl_seq)
        print '=' * 50
        fff()

    staged, = plt.plot(range(12, 145, 2), kl_seq_set[0], label='staged')
    normal, = plt.plot(range(12, 145, 2), kl_seq_set[1], label='normal')
    uniform, = plt.plot(range(12, 145, 2), kl_seq_set[2], label='uniform')
    plt.legend(handles=[normal, staged, uniform])
    plt.ylabel('KL-divergence')
    plt.xlabel('data')
    plt.show()

def import_pd_data(fname):
    import pandas as pd
    from collections import defaultdict

    df = pd.read_pickle(fname)
    grouped = df.groupby(['concept', 'target'], as_index=False)
    data = defaultdict(lambda: FunctionData(input=[], output={}))

    for (c, t), group in grouped:
        y = sum(group['rating'])
        n = len(group['rating']) - y
        try:
            concept = list(eval(c))
        except TypeError:  # non-iterable concepts get wrapped in a list
            concept = [eval(c)]
        target = eval(t)
        data[c].input = concept
        data[c].output[target] = (y, n)

    return data.values()

def import_josh_data(path=None):
    """Script for loading Josh's number game data.

    Data is originally in probability (i.e. float) format, so (# yes, # no)
    pairs are estimated by assuming 20 human participants.
    """
    import os
    from scipy.io import loadmat

    if path is None:
        path = os.getcwd()
    mat = loadmat(path + '/number_game_data.mat')
    mat_data = mat['data']
    number_game_data = []

    for d in mat_data:
        input_data = d[0][0].tolist()
        output_data = {}
        for i in range(len(d[1][0])):
            key = d[1][0][i]
            associated_prob = d[2][0][i]
            associated_yes = int(associated_prob * 20)
            output_data[key] = (associated_yes, 20 - associated_yes)  # est. (# yes, # no) responses
        function_datum = FunctionData(input=input_data, output=output_data)
        number_game_data.append(function_datum)

    return number_game_data

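# Minimal sketch of the probability -> (# yes, # no) conversion described in
# the docstring above, independent of the .mat file. The 20-participant
# assumption comes from the docstring; the numbers below are illustrative.
def prob_to_counts(p, n_participants=20):
    yes = int(p * n_participants)        # estimated "yes" responses
    return (yes, n_participants - yes)   # remainder are "no" responses

assert prob_to_counts(0.75) == (15, 5)
assert prob_to_counts(0.0) == (0, 20)
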
def sample_data(self, n):
    # Sample n strings and bin them into a single Counter-valued datum.
    cnt = Counter()
    for _ in xrange(n):
        cnt[self.sample_string()] += 1
    return [FunctionData(input=[], output=cnt, alpha=self.ALPHA)]

def load_words_and_data(path):
    """
    Takes a data path and returns [words, data].
    """
    # Load the data
    data = []
    with open(path, 'r') as f:
        for l in f:
            if re.match(r"\s*#", l):
                continue  # skip comments
            if not re.match(r"[^\s]", l):
                continue  # skip whitespace
            lhs, output = re.split(r"\s*=\s*", l.strip())
            args = re.split(r"\s+", lhs)
            data.append(FunctionData(input=args, output=output))

    # Figure out all the words (here, tokens).
    words = set()
    for di in data:
        words.add(di.output)
        for x in di.input:
            words.add(x)
    words = list(words)

    return [words, data]

def run(save_file, alpha, iters, propose_scale, propose_n, skip, summary_cap):
    # Faux data
    data = [
        HumanData(data=FunctionData(input=[2, 4, 6, 8], output=[]),
                  queries=(1, 20, 30, 48, 80, 99),
                  responses=((1, 19), (17, 3), (15, 5), (19, 1), (20, 0), (2, 18))),
        HumanData(data=FunctionData(input=[10, 40], output=[]),
                  queries=(1, 20, 30, 48, 80, 99),
                  responses=((1, 19), (20, 0), (20, 0), (2, 18), (19, 1), (2, 18)))
    ]

    # Enumerate all 'domain level' hypotheses generated by our grammar
    hypotheses = []
    for fn in simple_grammar.enumerate(d=10):
        h = NumberGameHypothesis(grammar=simple_grammar, domain=100, alpha=alpha)
        h.set_value(fn)
        h.compute_prior()
        hypotheses.append(h)

    grammar_h0 = GrammarHypothesisVectorized(simple_grammar, hypotheses,
                                             propose_scale=propose_scale, propose_n=propose_n)
    mh_grammar_sampler = MHSampler(grammar_h0, data, iters)
    mh_grammar_summary = VectorSummary(skip=skip, cap=summary_cap)
    print '^*' * 60, '\nGenerating GrammarHypothesis Samples\n', '^*' * 60

    # Initialize csv file
    mh_grammar_summary.csv_initfiles(save_file)

    # Sample GrammarHypotheses!
    for i, gh in enumerate(mh_grammar_summary(mh_grammar_sampler)):
        if i % 10 == 0:
            print i, " ITERATIONS"
            print '\n', '#' * 100
        # Save to CSV & print grammar rule values
        if i % skip == 0:
            mh_grammar_summary.csv_appendfiles(save_file, data)
            for idx in grammar_h0.get_propose_idxs():
                print idx, '\t| ', grammar_h0.rules[idx]

    mh_grammar_summary.pickle_summary(filename=save_file + '_summary.p')

def sample_data_as_FuncData(self, n, avg=True):
    """
    n:   can be a float in avg mode
    avg: sample multiple times and average to reduce noise;
         note the counts can then be fractional
    """
    if n == 0:
        return [FunctionData(input=[], output=Counter())]

    if avg:
        # Draw n * 512 samples, then scale every count back down by 512.
        cnt = Counter(self.sample_data(int(n * 512)))
        n = float(512)
        for key in cnt.keys():
            cnt[key] /= n
        return [FunctionData(input=[], output=cnt)]

    return [FunctionData(input=[], output=Counter(self.sample_data(n)))]

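# Toy illustration of the averaging trick in sample_data_as_FuncData above:
# drawing n * 512 samples and dividing every count by 512.0 yields
# fractional counts whose total mass is n, which reduces sampling noise.
# The Counter contents here are made up for illustration.
from collections import Counter

cnt = Counter({'ab': 700, 'aabb': 324})      # pretend n * 512 = 1024 draws, n = 2
for key in cnt.keys():
    cnt[key] /= 512.0
assert abs(sum(cnt.values()) - 2.0) < 1e-9   # total mass equals n
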
def sample_data(self, n):
    # Sample n strings from the grammar and bin them in a Counter.
    cnt = Counter()
    for _ in xrange(n):
        s = str(self.grammar.generate())
        cnt[s] += 1
    return [FunctionData(input=[0], output=cnt)]

def make_data(n=1):
    return [FunctionData(input=[{Obj(color='red'), Obj(color='red'), Obj(color='green')}],
                         output=True,
                         alpha=0.99)] * n

def make_data(size=10, alpha=0.99):
    # Replicate all data `size` times
    return [FunctionData(['A', Obj(shape='square', color='red')], True, alpha=alpha),
            FunctionData(['A', Obj(shape='square', color='blue')], False, alpha=alpha),
            FunctionData(['A', Obj(shape='triangle', color='blue')], False, alpha=alpha),
            FunctionData(['A', Obj(shape='triangle', color='red')], False, alpha=alpha),
            FunctionData(['B', Obj(shape='square', color='red')], False, alpha=alpha),
            FunctionData(['B', Obj(shape='square', color='blue')], True, alpha=alpha),
            FunctionData(['B', Obj(shape='triangle', color='blue')], True, alpha=alpha),
            FunctionData(['B', Obj(shape='triangle', color='red')], True, alpha=alpha)] * size

def make_data(size=50):
    return [FunctionData(input=[],
                         output={'N V': size,
                                 'D N V': size,
                                 'D N V N': size,
                                 'D N V D N': size})]

def make_data(n=1, target=F1, data_size=100, sd=0.1):
    # Generate data_size noisy (x, y) observations of the target function.
    data = []
    for i in range(data_size):
        x = random()
        y = target(x) + normal() * sd
        data.append(FunctionData(input=[x], output=y, ll_sd=sd))
    return data * n

def get_top_N(pair1, pair2):
    priors = {}
    complete = 0
    for p1 in pair1:
        for p2 in pair2:
            data = [FunctionData(alpha=alpha, input=[p1], output={p2: len(p2)})]
            h0 = MyHypothesis()
            top_hyps = set()
            seen = set()
            chains = 0
            # Keep running chains until enough distinct hypotheses are found.
            while (len(seen) < n_top and chains < max_chains) or (len(seen) < 3):
                chains += 1
                x = 0
                for h in MHSampler(h0, data, steps=steps, acceptance_temperature=acc_temp):
                    out = h(p1)[:len(p2)]
                    str_h = str(h)
                    # Accept hypotheses that reproduce p2 exactly.
                    if len(out) == len(p2) and hamming_distance(out, p2) == 0:
                        if str_h not in seen:
                            top_hyps.add((copy.deepcopy(h), h.prior))
                            seen.add(str_h)
                    if x % 1000 == 0:
                        print_star(x, h, out, p2, h.value.get_rule_signature(), len(seen))
                    x += 1
            print_star()
            priors[(p1, p2)] = []
            # Keep the n_top highest-prior hypotheses.
            for h in sorted(top_hyps, key=lambda tup: -tup[1])[:n_top]:
                print p1, p2
                print h[0], h[1], h[0].value.count_subnodes()
                priors[(p1, p2)].append((copy.deepcopy(h[0]), h[1], h[0].value.count_subnodes()))
            complete += 1
            print "complete: %d" % complete
    for key in priors:
        print "***"
        print key
        for p in priors[key]:
            print p
        print "***"
    return priors

def make_data(data_size=1, alpha=0.95):
    # Sample data_size (initial, end) worked-example pairs from WSList.
    data = []
    randomWSList = random.sample(WSList, data_size)
    for i in range(len(randomWSList)):
        data.append(FunctionData(input=[randomWSList[i][0]],
                                 output=randomWSList[i][1],
                                 alpha=alpha))
    return data

def make_data(size=1, alpha=0.99):
    return [FunctionData(input=['aaaa'], output=True, alpha=alpha),
            FunctionData(input=['aaab'], output=False, alpha=alpha),
            FunctionData(input=['aabb'], output=False, alpha=alpha),
            FunctionData(input=['aaba'], output=False, alpha=alpha),
            FunctionData(input=['aca'], output=True, alpha=alpha),
            FunctionData(input=['aaca'], output=True, alpha=alpha),
            FunctionData(input=['a'], output=True, alpha=alpha)] * size

def make_data(N=30):
    """
    The data here consist of Saffran-Aslin-Newport type strings. They have a
    geometric length distribution, more like what you might find in natural
    data, with shorter strings being more frequent. This is modeled in the
    hypothesis with a flip for whether or not you recurse to generate a
    longer string.
    """
    cnt = Counter()
    for _ in xrange(N):
        cnt[''.join(sample_one(words) for _ in xrange(5))] += 1
    return [FunctionData(input=[], output=cnt)]

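# Hedged sketch of the geometric length distribution the docstring above
# describes: a coin flip decides whether to recurse and emit another word.
# `toy_words` and `p_continue` are illustrative assumptions, not names from
# the original module.
import random

def sample_geometric_string(toy_words, p_continue=0.5):
    s = random.choice(toy_words)           # always emit at least one word
    while random.random() < p_continue:    # flip: recurse for a longer string
        s += random.choice(toy_words)
    return s

print sample_geometric_string(['tu', 'pi', 'ro'])
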
def treebank2FunctionData(strs):
    """
    Parse treebank-style trees into FunctionNodes and return a list of
    FunctionData in the right format.

    The data is in the following format:
        di.args   = T
        di.output = []
    where the datum's "output" is implicit in T (depending on where we
    choose pronouns). SO: all the fanciness is handled in the likelihood.
    """
    return map(lambda s: FunctionData(input=[list2FunctionNode(parseScheme(s))], output=None),
               strs)

def make_data(n=1):
    return [FunctionData(input=[1000.], output=1500., ll_sd=data_sd),
            FunctionData(input=[828.], output=1340., ll_sd=data_sd),
            FunctionData(input=[800.], output=1328., ll_sd=data_sd),
            FunctionData(input=[600.], output=1172., ll_sd=data_sd),
            FunctionData(input=[300.], output=800., ll_sd=data_sd),
            FunctionData(input=[0.], output=0., ll_sd=data_sd)  # added (0, 0) since it makes physical sense
            ] * n

def myrun(observed_set):
    if LOTlib.SIG_INTERRUPTED:
        return set()

    h0 = NumberGameHypothesis(grammar=grammar)
    data = [FunctionData(input=[], output=observed_set, alpha=ALPHA)]

    tn = TopN(N=options.TOP_COUNT)
    for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
        tn.add(h)

    print "# Finished %s" % str(observed_set)
    return set(tn.get_all())

def make_data(n=1):
    data = []
    for _ in xrange(n):
        for a, b in itertools.product(OBJECTS, OBJECTS):
            myinput = [a, b]
            # Opposites (n/p) interact; x interacts with nothing.
            myoutput = (a[0] != b[0]) and (a[0] != 'x') and (b[0] != 'x')
            data.append(FunctionData(input=myinput, output=myoutput, alpha=0.99))
    return data

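# Quick check of the interaction rule used in make_data above. Objects are
# read as strings whose first character is a charge tag ('n', 'p', or 'x'),
# an assumption inferred from the a[0]/b[0] indexing; opposite charges
# interact and 'x' interacts with nothing.
interacts = lambda a, b: (a[0] != b[0]) and (a[0] != 'x') and (b[0] != 'x')
assert interacts('n1', 'p1') is True     # opposites interact
assert interacts('n1', 'n2') is False    # like charges do not
assert interacts('x1', 'p1') is False    # x interacts with nothing
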
def get_top_N(pair1, pair2):
    priors = {}
    for p1 in pair1:
        for p2 in pair2:
            data = [FunctionData(alpha=alpha, input=[p1], output={p2: len(p2)})]
            h0 = MyHypothesis()
            top_hyps = set()
            seen = set()
            x = 0
            # Sample until we have collected 2 * n_top distinct hypotheses.
            while len(top_hyps) < n_top * 2:
                for h in MHSampler(h0, data, steps=steps):
                    out = h(p1)[:len(p2)]
                    str_h = str(h)
                    # Accept hypotheses that reproduce p2 exactly.
                    if len(out) == len(p2) and hamming_distance(out, p2) == 0:
                        if str_h not in seen:
                            top_hyps.add((copy.deepcopy(h), h.prior))
                            seen.add(str_h)
                    if x % 1000 == 0:
                        print p1, p2
                        print_star(x, h, out, p2, h.value.get_rule_signature())
                    x += 1
            print_star()
            priors[(p1, p2)] = []
            # Keep the n_top highest-prior hypotheses.
            for h in sorted(top_hyps, key=lambda tup: -tup[1])[:n_top]:
                print p1, p2
                print h[0], h[1], h[0].value.count_subnodes()
                priors[(p1, p2)].append((copy.deepcopy(h[0]), h[1], h[0].value.count_subnodes()))
    for key in priors:
        print "***"
        print key
        for p in priors[key]:
            print p
        print "***"
    return priors

def main():
    lst = ["(", "[", "]", ")"]
    randomTstLists = [("(", ")", "[", "]"), ("[", "]", "(", ")")]

    # d simulates N participants who center embed correctly at rate pC,
    # tail embed at rate pT, and do something else the rest of the time.
    # FINISH THIS!!!
    d = simulateData(lst, randomTstLists, pC=0.4, pT=0.6, N=500)
    for k in d.keys():
        print k, d[k]

    data = [FunctionData(input=(), output=d, alpha=0.9)]
    r = run(data, TOP=5, STEPS=5000)
    for i in r:
        print i
        print i[0](), i[0](), i[0]()

def csvToFunctionData(filename):
    with open(filename, mode='rb') as f:
        reader = csv.reader(f)
        rows = [row for row in reader]

    ins = defaultdict(list)    # was defaultdict(list()), which raises TypeError
    outs = defaultdict(list)

    # Fill `ins` and `outs` dictionaries
    for row in rows:
        if row[1] == 'in':     # `is` compares identity, not string equality
            ins[row[0]].append(row[2])
        if row[1] == 'out':
            outs[row[0]].append(row[2:])

    # Fill FunctionData objects
    data_keys = set(row[0] for row in rows)
    data = {}
    for k in data_keys:
        data[k] = FunctionData(input=ins[k], output=outs[k])
    return data

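# The reader above expects rows shaped like [key, 'in'|'out', value, ...].
# A hedged round-trip sketch with a hypothetical file name (Python 2's csv
# module wants binary mode for writing):
import csv

with open('toy_data.csv', mode='wb') as f:
    w = csv.writer(f)
    w.writerow(['concept1', 'in', '2'])
    w.writerow(['concept1', 'out', '4', '6'])

data = csvToFunctionData('toy_data.csv')
print data['concept1'].input     # ['2']
print data['concept1'].output    # [['4', '6']]
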
def make_data(data_size=300, alpha=0.75):
    """
    Sample some data according to the target.
    """
    data = []
    for i in range(data_size):
        # how many in this set
        set_size = weighted_sample(range(1, 10 + 1),
                                   probs=[7187, 1484, 593, 334, 297, 165, 151, 86, 105, 112])
        # get the objects in the current set
        s = set(sample_sets_of_objects(set_size, all_objects))

        # sample according to the target
        if random() < alpha:
            r = WORDS[len(s) - 1]
        else:
            r = weighted_sample(WORDS)

        # and append the sampled utterance
        data.append(FunctionData(input=[s], output=r, alpha=alpha))
    return data

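# Sketch of the alpha-noise scheme in make_data above: with probability
# alpha the correct word for the set size is produced; otherwise a word is
# drawn by frequency. noisy_label and the uniform fallback below are toy
# stand-ins for the module's WORDS / weighted_sample, for illustration only.
import random

def noisy_label(true_word, all_words, alpha=0.75):
    if random.random() < alpha:
        return true_word                 # target-consistent response
    return random.choice(all_words)      # error: uniform toy stand-in
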
def make_data(size=datamt):
    return [FunctionData(input=[],
                         output={'h e s': size, 'm e s': size, 'm e g': size, 'h e g': size,
                                 'm e n': size, 'h e m': size, 'm e k': size, 'k e s': size,
                                 'h e k': size, 'k e N': size, 'k e g': size, 'h e n': size,
                                 'm e N': size, 'k e n': size, 'h e N': size, 'f e N': size,
                                 'g e N': size, 'n e N': size, 'n e s': size, 'f e n': size,
                                 'g e n': size, 'g e m': size, 'f e m': size, 'g e k': size,
                                 'f e k': size, 'f e g': size, 'f e s': size, 'n e g': size,
                                 'k e m': size, 'n e m': size, 'g e s': size, 'n e k': size})]

def uniform_data(size, max_length=None):
    # Spread `size` counts uniformly over the strings a^i b^i for
    # i = 1 .. max_length/2 (Python 2 integer division). Note the caller
    # must supply max_length; the None default would raise a TypeError.
    cnt = Counter()
    num = size * 2 / max_length
    for i in xrange(1, max_length / 2 + 1):
        cnt['a' * i + 'b' * i] = num
    return [FunctionData(input=[], output=cnt)]

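# Usage sketch for uniform_data above: `size` is split evenly across the
# max_length/2 balanced strings a^i b^i. Numbers are illustrative.
fd, = uniform_data(24, max_length=6)
print fd.output    # e.g. Counter({'ab': 8, 'aabb': 8, 'aaabbb': 8})
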
def run(pairs):
    top_hyps = set()
    already_done = set()
    t_start = time.time()

    for pair in pairs:
        p1, p2 = pair
        h0 = MyHypothesis()
        t_pair = time.time()
        seen = set()

        # Train on successively longer prefixes of the pair.
        for ind in xrange(len(p1) + 1):
            seen_round = set()
            x = 0
            p1_i = p1[:ind]
            p2_i = p2[:ind]
            if (p1, p2_i) in already_done:
                continue
            already_done.add((p1, p2_i))
            data = [FunctionData(alpha=alpha, input=[p1], output={p2_i: len(p2_i)})]

            while len(seen_round) < n_top:
                for h in MHSampler(h0, data, steps=steps,
                                   acceptance_temperature=acc_temp,
                                   prior_temperature=prior_temp):
                    if len(seen_round) >= n_top:
                        break
                    str_h = str(h.value)
                    out = h(p1)[:len(p2_i)]
                    # Accept hypotheses that reproduce the p2 prefix exactly
                    # and produce a full-length output for p1.
                    if (len(out) == len(p2_i)
                            and hamming_distance(out, p2_i) == 0
                            and len(h(p1)[:len(p1)]) == len(p1)):
                        if str_h not in seen:
                            l_rules = [str(i) for i in
                                       list(numpy.hstack(get_rule_counts(grammar, h.value)))]
                            top_hyps.add((toAB(p1), ind, copy.deepcopy(h), toAB(p2),
                                          toAB(h(p1_i)[:len(p1)]), ",".join(l_rules),
                                          str(h.value)))
                            seen.add(str_h)
                        if str_h not in seen_round:
                            seen_round.add(str_h)
                    if x % 1000 == 0:
                        print_star("seen:%d" % len(seen_round),
                                   "steps:%d" % x,
                                   "hyp:%s" % str_h,
                                   "p2:%s" % p2_i,
                                   "out:%s" % out,
                                   "prior:%f" % h.prior,
                                   "pair_time:%.2f" % (time.time() - t_pair),
                                   "tot_time:%.2f" % (time.time() - t_start))
                    x += 1

    for h in top_hyps:
        print_star(h[0], h[1], h[2], h[3], h[4], h[5])
    return top_hyps