def makeZipfianLexiconData(lexicon, word, context, n=100, s=1.0, alpha=0.9, verbose=False):
    # TODO remove word param from Shift files
    data = []
    true_set = lexicon.make_true_data(context)
    all_poss_speakers = [t[1] for t in true_set]
    p = [zipf(t, s, context, len(context.objects)) for t in all_poss_speakers]

    for i in xrange(n):
        if flip(alpha):
            speaker = weighted_sample(all_poss_speakers, probs=p)
            bagR = {w: lexicon(w, context, set([speaker])) for w in lexicon.all_words()}
            uniqR = []
            for w in lexicon.all_words():
                uniqR.extend(bagR[w])
            p1 = [zipf(t, s, context, len(context.objects)) for t in uniqR]
            referent = weighted_sample(uniqR, probs=p1)
            word = sample1([w for w in lexicon.all_words() if referent in bagR[w]])
            if verbose:
                print "True data:", i, word, speaker, referent
            data.append(KinshipData(word, speaker, referent, context))
        else:
            word = sample1(lexicon.all_words())
            x = sample1(context.objects)
            y = sample1(context.objects)
            if verbose:
                print "Noise data:", i, word, x, y
            data.append(KinshipData(word, x, y, context))

    if verbose:
        print lexicon.compute_likelihood(data)
    return data
def makeVariableLexiconData(lexicon, word, context, n=100, s=1.0, alpha=0.9, verbose=False):
    data = []
    true_set = lexicon.make_true_data(context)
    all_poss_speakers = [t[1] for t in true_set]
    p = [zipf(t, s, context, len(context.objects)) for t in all_poss_speakers]

    for i in xrange(n):
        if flip(alpha):
            speaker = weighted_sample(all_poss_speakers, probs=p)
            referents = lexicon(word, context, set([speaker]))
            p1 = [zipf(t, s, context, len(context.objects)) for t in referents]
            referent = weighted_sample(referents, probs=p1)
            if verbose:
                print "True data:", i, word, speaker, referent
            data.append(KinshipData(word, speaker, referent, context))
        else:
            x = sample1(context.objects)
            y = sample1(context.objects)
            if verbose:
                print "Noise data:", i, word, x, y
            data.append(KinshipData(word, x, y, context))

    if verbose:
        print lexicon.compute_likelihood(data)
    return data
def genetic_algorithm(make_hypothesis, data, mutate, crossover, population_size=100, generations=100000):
    population = [make_hypothesis() for _ in xrange(population_size)]
    for h in population:
        h.compute_posterior(data)

    for g in xrange(generations):
        nextpopulation = []
        while len(nextpopulation) < population_size:
            # sample proportional to fitness
            mom = weighted_sample(population, probs=[v.posterior_score for v in population], log=True)
            dad = weighted_sample(population, probs=[v.posterior_score for v in population], log=True)

            try:
                kid = mutate(crossover(mom, dad))
            except (ProposalFailedException, NodeSamplingException):
                continue

            kid.compute_posterior(data)
            yield kid
            nextpopulation.append(kid)

            # # if MH_acceptance(population[i].posterior_score, kid.posterior_score, 0.0):
            # if kid.posterior_score > population[i].posterior_score:
            #     population[i] = kid
            #     yield kid

        population = nextpopulation
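# A minimal usage sketch (an assumption, not taken from the original source): since
# genetic_algorithm is a generator, a caller can lazily pull scored hypotheses from it
# and keep the best one seen so far. The names my_data, my_mutate and my_crossover are
# hypothetical placeholders standing in for the function's arguments.
from itertools import islice

def run_ga_sketch(make_hypothesis, my_data, my_mutate, my_crossover, steps=1000):
    best = None
    for kid in islice(genetic_algorithm(make_hypothesis, my_data, my_mutate, my_crossover,
                                        population_size=50, generations=100), steps):
        if best is None or kid.posterior_score > best.posterior_score:
            best = kid
    return best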
def sample_utterance(self, possible_utterances, context):
    t, f, others = self.partition_utterances(possible_utterances, context)
    m = set(t).union(f)

    if flip(self.palpha) and (len(m) > 0):  # sample an utterance whose presupposition is true
        if flip(self.alpha) and (len(t) > 0):
            return weighted_sample(t, probs=map(lambda u: self.weightfunction(u, context), t), log=False)
        else:
            return weighted_sample(m, probs=map(lambda u: self.weightfunction(u, context), m), log=False)
    else:
        # sample from all utterances
        return weighted_sample(possible_utterances,
                               probs=map(lambda u: self.weightfunction(u, context), possible_utterances),
                               log=False)
def propose(current_state, bag=lexicon, probs=L):
    # cycle deterministically through the words using a counter stored on the function itself
    mod = len(current_state.all_words())
    proposal = copy(current_state)
    proposal.value[words[propose.inx % mod]].value = weighted_sample(bag[words[propose.inx % mod]],
                                                                     probs=probs[words[propose.inx % mod]],
                                                                     log=True).value
    propose.inx += 1
    return proposal
def sample_data(self, n):
    """
    Return a dictionary of {string:count} that is a sample from this language
    """
    return weighted_sample(self.str_sets, N=n, probs=self.string_log_probability, log=True)
def sample_sets_of_objects(N, objs):
    """
    Make a set of size N appropriate for using "set" functions on -- this means it must
    contain copies, not duplicate references.
    """
    s = weighted_sample(objs, N=N, returnlist=True)  # the set of objects
    # The set must NOT be just the sampled pointers, since then set() operations would collapse them.
    return map(deepcopy, s)
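# The snippets in this file all rely on weighted_sample from the surrounding library.
# Below is a minimal sketch of what it presumably does, inferred only from the call
# sites in this file (probs may be a list or a callable, log=True means probs are
# log-weights, N with returnlist=True returns a list of draws). It is an illustration
# under those assumptions, not the library's actual implementation, and it omits
# options such as Z= and return_probability= that appear elsewhere in this file.
from math import exp
from random import random as _uniform

def weighted_sample_sketch(objs, N=1, probs=None, log=False, returnlist=False):
    objs = list(objs)
    if probs is None:
        weights = [1.0] * len(objs)            # uniform when no weights are given
    elif callable(probs):
        weights = [probs(o) for o in objs]     # probs can be a weight function
    else:
        weights = list(probs)
    if log:
        m = max(weights)
        weights = [exp(w - m) for w in weights]  # exponentiate log-weights stably
    Z = float(sum(weights))

    out = []
    for _ in xrange(N):
        r = _uniform() * Z
        for o, w in zip(objs, weights):
            r -= w
            if r <= 0:
                out.append(o)
                break
        else:
            out.append(objs[-1])               # guard against floating-point round-off
    return out if (returnlist or N > 1) else out[0]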
def generate_data(data_size):
    """
    Sample some data according to the target
    """
    data = []
    for i in range(data_size):
        # how many in this set
        set_size = weighted_sample(range(1, 10+1), probs=[7187, 1484, 593, 334, 297, 165, 151, 86, 105, 112])

        # get the objects in the current set
        s = set(sample_sets_of_objects(set_size, all_objects))

        # sample according to the target
        if random() < ALPHA:
            r = WORDS[len(s)-1]
        else:
            r = weighted_sample(WORDS)

        # and append the sampled utterance
        data.append(FunctionData(input=[s], output=r))  # convert to "FunctionData" and store

    return data
def distance_based_proposer(x):
    y, lp = weighted_sample(proposal_to[x, :], probs=proposal_probs[x, :], Z=proposal_Z[x],
                            return_probability=True, log=False)
    # the distance d is the same in both directions, but the normalizer differs
    bp = lp + log(proposal_Z[x]) - log(proposal_Z[y])
    return y, lp - bp
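# A worked reading of the correction above (an interpretive note, assuming lp is the log
# probability of drawing y and the unnormalized proposal weight w(x, y) is symmetric, as
# the in-line comment about distance suggests):
#   lp = log q(y|x) = log w(x, y) - log Z[x]
#   bp = lp + log Z[x] - log Z[y]
#      = log w(x, y) - log Z[y]
#      = log w(y, x) - log Z[y] = log q(x|y)
# so the returned lp - bp = log q(y|x) - log q(x|y) is the forward-minus-backward term
# used in a Metropolis-Hastings acceptance ratio.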
def make_data(data_size=300, alpha=0.75):
    """
    Sample some data according to the target
    """
    data = []
    for i in range(data_size):
        # how many in this set
        set_size = weighted_sample(range(1, 10+1), probs=[7187, 1484, 593, 334, 297, 165, 151, 86, 105, 112])

        # get the objects in the current set
        s = set(sample_sets_of_objects(set_size, all_objects))

        # sample according to the target
        if random() < alpha:
            r = WORDS[len(s)-1]
        else:
            r = weighted_sample(WORDS)

        # and append the sampled utterance
        data.append(FunctionData(input=[s], output=r, alpha=alpha))

    return data
def propose(self):
    """
    Default proposal to a lexicon -- propose to at least one word for sure, plus
    each of the other words with probability self.propose_p.
    :return: the proposed lexicon and the forward/backward log probability
    """
    new = copy(self)  ## Now we just copy the whole thing

    # Propose to one word for sure
    w = weighted_sample(self.value.keys())  # the word to change
    p, fb = self.value[w].propose()
    new.set_word(w, p)

    # Maybe propose to each of the other words as well
    for x in self.all_words():
        if w != x and flip(self.propose_p):
            xp, xfb = self.value[x].propose()
            new.set_word(x, xp)
            fb += xfb

    return new, fb
def sample_output(self, datum):
    # return a sample of my output given the input in datum
    if random() < datum.alpha:
        return self(*datum.input)
    else:
        return weighted_sample(WORDS)  # uniform sample
def propose_tree(self, t):
    # Default regeneration proposal with some probability
    if random() >= self.insert_delete_probability:
        return self.my_regeneration_proposal.propose_tree(t)

    newt = copy(t)
    fb = 0.0  # the forward/backward prob we return
    sampled = False  # so we can see if we didn't do it

    if random() < 0.5:  # So we insert

        # first sample a node (through sample_node_via_iterate, which handles everything well)
        for ni, di, resample_p, resample_Z in self.grammar.sample_node_via_iterate(newt):
            if ni.args is None:
                continue  # Can't deal with these  TODO: CHECK THIS?

            # Since it's an insert, see if there is a (replicating) rule that expands
            # from ni.returntype to some ni.returntype
            replicating_rules = filter(lambda x: x.name != 'lambda' and (x.to is not None) and any([a == ni.returntype for a in x.to]),
                                       self.grammar.rules[ni.returntype])

            # If there are none, then we can't insert!
            if len(replicating_rules) == 0:
                continue

            # choose a replicating rule; NOTE: this is done uniformly in this step, for simplicity
            r, gp = weighted_sample(replicating_rules, probs=lambda x: x.p, return_probability=True, log=False)

            gp = log(r.p) - sum([x.p for x in self.grammar.rules[ni.returntype]])  # this is the probability overall in the grammar, not my prob of sampling

            # Now take the rule and expand the children:

            # choose who gets to be ni
            nrhs = len([x for x in r.to if x == ni.returntype])  # how many on the rhs are there?
            if nrhs == 0:
                continue

            replace_i = randint(0, nrhs - 1)  # choose the one to replace

            ## Now expand args but only for the one we don't sample...
            args = []
            for x in r.to:
                if x == ni.returntype:
                    if replace_i == 0:
                        args.append(copy(ni))  # if it's the one we replace into
                    else:
                        args.append(self.grammar.generate(x, d=di+1))  # else generate like normal
                    replace_i -= 1
                else:
                    args.append(self.grammar.generate(x, d=di+1))  # else generate like normal

            # Now we must count the multiple ways we could go forward or back
            after_same_children = [x for x in args if x == ni]  # how many are the same after?
            # backward_resample_p = sum([x.resample_p for x in after_same_children])  # if you go back, you can choose any identical kids

            # create the new node
            sampled = True
            ni.setto(FunctionNode(returntype=r.nt, name=r.name, args=args,
                                  generation_probability=gp, bv_name=None, bv_args=None,
                                  ruleid=r.rid, resample_p=r.resample_p))

        if sampled:
            new_lp_below = sum(map(lambda z: z.log_probability(), filter(isFunctionNode, args))) - ni.log_probability()

            newZ = self.grammar.resample_normalizer(newt)
            # To sample forward: choose the node ni, choose the replicating rule, choose which "to" to expand
            # (we could have put it on any of the replicating rules that are identical), and generate the rest of the tree
            f = (log(resample_p) - log(resample_Z)) + -log(len(replicating_rules)) + (log(len(after_same_children)) - log(nrhs)) + new_lp_below
            # To go backwards, choose the inserted rule, and any of the identical children, out of all replicators
            b = (log(ni.resample_p) - log(newZ)) + (log(len(after_same_children)) - log(nrhs))
            fb = f - b

    else:  # A delete move!
        for ni, di, resample_p, resample_Z in self.grammar.sample_node_via_iterate(newt):
            if ni.name == 'lambda':
                continue  # can't do anything
            if ni.args is None:
                continue  # Can't deal with these  TODO: CHECK THIS?

            # Figure out which of my children have the same type as me
            replicating_kid_indices = [i for i in xrange(len(ni.args)) if isFunctionNode(ni.args[i]) and ni.args[i].returntype == ni.returntype]

            nrk = len(replicating_kid_indices)  # how many replicating kids
            if nrk == 0:
                continue  # if no replicating rules here

            ## We need to compute a few things for the backwards probability
            replicating_rules = filter(lambda x: (x.to is not None) and any([a == ni.returntype for a in x.to]),
                                       self.grammar.rules[ni.returntype])
            if len(replicating_rules) == 0:
                continue

            i = sample1(replicating_kid_indices)  # who to promote; NOTE: not done via any weighting

            # Now we must count the multiple ways we could go forward or back
            # Here, we could have sampled any of them equivalent to ni.args[i]
            before_same_children = [x for x in ni.args if x == ni.args[i]]  # how many are the same?

            # the lp of everything we'd have to create going backwards
            old_lp_below = sum(map(lambda z: z.log_probability(), filter(isFunctionNode, ni.args))) - ni.args[i].log_probability()

            # and replace it
            sampled = True
            ni.setto(copy(ni.args[i]))  # TODO: copy not necessary here, I think?

        if sampled:
            newZ = self.grammar.resample_normalizer(newt)
            # To go forward, choose the node, and then from all equivalent children
            f = (log(resample_p) - log(resample_Z)) + (log(len(before_same_children)) - log(nrk))
            # To go back, choose the node, choose the replicating rule, choose where to put it, and generate the rest of the tree
            b = (log(ni.resample_p) - log(newZ)) + -log(len(replicating_rules)) + (log(len(before_same_children)) - log(nrk)) + old_lp_below
            fb = f - b

    # and fix the bound variables, whose depths may have changed
    if sampled:
        newt.fix_bound_variables()

    return [newt, fb]
NYes = [0] * (DATASET_SIZE * NDATASETS)  # number of yes responses for each item
NNo = [0] * (DATASET_SIZE * NDATASETS)   # number of no responses for each item

di = 0
for datasi, data in enumerate(datas):
    print "# Simulating data for ", datasi
    for i in xrange(len(data)):

        # update the posterior on the data seen so far
        for h in hypotheses:
            h.compute_posterior([data[j] for j in xrange(i)])
        probs = [x.posterior_score for x in hypotheses]

        # each simulated person samples a hypothesis from the posterior and responds
        for person in break_ctrlc(xrange(NPEOPLE)):
            h = weighted_sample(hypotheses, probs=probs, log=True)

            if random() < ALPHA:
                r = h(*data[i].input)  # and use it to respond to the next one
            else:
                r = random() < BETA

            if r:
                NYes[di] += 1
            else:
                NNo[di] += 1

        di += 1

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Take into account the likelihoods in our inference
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def propose_tree(self, t):
    p = weighted_sample(self.proposals, probs=self.probs, log=False)
    return p.propose_tree(t)
def propose_tree(self, grammar, tree, resampleProbability=lambdaOne):
    """ sample a sub-proposer and propose from it """
    chosen_proposer = weighted_sample(self.proposers, probs=self.proposer_weights)
    return chosen_proposer.propose_tree(grammar, tree, resampleProbability)
def sample_string(self):
    # sample with probability proportional to 2^-len(s), so shorter strings are more likely
    return weighted_sample(self.strings, probs=lambda s: pow(2.0, -len(s)))
def sample_string(self):  # FIXME: this is not context-free
    return weighted_sample(self.strings, probs=self.probs)
def makeZipfianLexiconData(lexicon, context, dfreq=None, n=100, s=1.0, alpha=0.9, epsilon=0.8, verbose=False):
    '''
    L() --> P(W) [ eps * P(S|W) P(R|W) + (1 - eps) * P(S|W) P(R|S,W) ]

        P(W)     ~ dfreq, or defaults to uniform
        P(S|W)   ~ Zipf(s); domain: all speakers that can use that word
        P(R|W)   ~ Zipf(s); domain: all people the learner has a word for
        P(R|S,W) ~ Zipf(s); domain: all referents the speaker can use the word to refer to

    :param lexicon: the target lexicon
    :param context: the context
    :param dfreq: dictionary[word] = frequency weight (float)
    :param n: the number of data points
    :param s: the Zipfian exponent parameter
    :param alpha: the reliability parameter; noise = 1 - alpha
    :param epsilon: the ego-centric probability
    :param verbose: print the generated data points
    :return: list of KinshipData objects
    '''
    assert context.distance is not None, "There are no distances in the context!"
    if dfreq is not None:
        assert set(lexicon.all_words()).issubset(set(dfreq.keys())), "Words in lexicon without frequencies"
        freq = lambda w: dfreq[w]
    else:
        freq = None

    data = []
    speakers = dict()
    egoRef = dict()
    for w in lexicon.all_words():
        speakers[w] = [t[1] for t in lexicon.make_word_data(w, context)]
        egoRef[w] = [t[2] for t in lexicon.make_word_data(w, context, fixX=context.ego)]

    for i in xrange(n):
        if flip(alpha):
            wrd = weighted_sample(lexicon.all_words(), probs=freq)
            speaker = weighted_sample(speakers[wrd], probs=lambda x: zipf(x, s, context, len(context.objects)))
            if flip(epsilon):
                referent = weighted_sample(egoRef[wrd], probs=lambda x: zipf(x, s, context, len(context.objects)))
                eps = 'Ego'
            else:
                referent = weighted_sample(lexicon(wrd, context, set([speaker])),
                                           probs=lambda x: zipf(x, s, context, len(context.objects)))
                eps = 'Speaker'
            if verbose:
                print "True data:", i, wrd, speaker, referent, eps
            data.append(KinshipData(wrd, speaker, referent, context))
        else:
            wrd = weighted_sample(lexicon.all_words(), probs=freq)
            x = weighted_sample(context.objects, probs=lambda x: zipf(x, s, context, len(context.objects)))
            y = weighted_sample(context.objects, probs=lambda x: zipf(x, s, context, len(context.objects)))
            if verbose:
                print "Noise data:", i, wrd, x, y
            data.append(KinshipData(wrd, x, y, context))

    if verbose:
        print lexicon.compute_likelihood(data)
    return data