def sample(self,m,components=None):
    """
    Samples m samples from the current finite mixture distribution.

    The number of samples per mixture component is drawn from a multinomial
    with probabilities ``self.param['alpha']``; the samples of each component
    are then scattered over randomly permuted column positions of the result.

    :param m: Number of samples to draw.
    :type m: int.
    :param components: Optional container that is filled in place with the
        index of the component each sample was drawn from. It is indexed with
        a list of column positions, so it presumably has to be a numpy array
        with at least m entries (TODO confirm against callers).
    :rtype: natter.DataModule.Data
    :returns: A Data object containing the samples

    """
    # One throw-away sample is drawn only to find out the dimensionality.
    dim = self['P'][0].sample(1).dim()
    nc = multinomial(m,self.param['alpha'])
    # A concrete list is required: a Python 3 ``range`` object cannot be
    # shuffled in place.
    mrange = list(range(m))
    shuffle(mrange)
    X = zeros((dim,m))
    ind = 0
    K = len(self['P'])
    for k in range(K):
        dat = self.param['P'][k].sample(nc[k])
        # Column positions reserved for the samples of component k.
        columns = mrange[ind:ind + nc[k]]
        X[:,columns] = dat.X
        if components is not None:
            components[columns] = k
        ind += nc[k]
    return Data(X,"%i samples from a %i-dimensional finite mixture distribution" % (m,dim))
def _diff(self, dt: float):
    """Sample the change of every compartment over a time step ``dt``.

    For each source compartment in ``self.model.transition_rates`` a random
    number of individuals is moved to its destination(s): a binomial draw
    when there is exactly one outgoing edge, a multinomial draw otherwise.

    :param dt: Length of the time step; every rate is multiplied by it.
    :returns: Dict mapping each compartment in ``self.model.compartments``
        to its net change (inflow minus outflow).
    """
    model = self.model
    delta = dict.fromkeys(model.compartments, 0)

    for source, outgoing in model.transition_rates.items():
        if len(outgoing) != 1:
            # Only the "E" compartment is expected to branch.
            assert source == "E"
            # The trailing 0 is the "catch-all" slot, i.e. the individuals
            # that stay exposed; it is left out of the outflow below.
            rates = np.array([rate for _, rate in outgoing] + [0])
            moved = multinomial(model[source], rates * dt)
            delta[source] -= sum(moved[:-1])
            for (target, _), amount in zip(outgoing, moved):
                delta[target] += amount
            continue

        ((target, rate),) = outgoing
        moved = binomial(model[source], rate * dt)
        # Whatever leaves the source arrives at the target.
        delta[source] -= moved
        delta[target] += moved

    return delta
def generate(num_seq, seq_length, alphabet, m_word_length, m_word_param, background_param):
    """Generate random sequences that each contain one hidden "magic word".

    One symbol distribution per magic-word position is drawn from
    ``dirichlet(m_word_param)`` and one background distribution from
    ``dirichlet(background_param)``. Every sequence consists of
    ``seq_length - m_word_length`` background symbols with a freshly sampled
    magic word inserted at a random position.

    :param num_seq: Number of sequences to generate.
    :param seq_length: Total length of each sequence (magic word included).
    :param alphabet: Indexable collection of symbols.
    :param m_word_length: Length of the magic word.
    :param m_word_param: Dirichlet parameters for the magic-word positions.
    :param background_param: Dirichlet parameters for the background.
    :returns: ``[starts, sequences]`` where ``starts[k]`` is the insertion
        position of the magic word in ``sequences[k]`` (a list of symbols).
    """
    def draw_symbol(theta):
        # A single-trial multinomial is a one-hot vector; the position of
        # its 1 selects the symbol.
        return alphabet[multinomial(1, theta).tolist().index(1)]

    word_thetas = [dirichlet(m_word_param) for _ in range(m_word_length)]
    bg_theta = dirichlet(background_param)
    bg_length = seq_length - m_word_length

    starts = []
    sequences = []
    for _ in range(num_seq):
        filler = [draw_symbol(bg_theta) for _ in range(bg_length)]
        word = [draw_symbol(theta) for theta in word_thetas]
        start = randint(bg_length)
        # Splice the magic word into the background at the chosen position.
        sequences.append(filler[:start] + word + filler[start:])
        starts.append(start)

    return [starts, sequences]
def multinomial(trials, probs, shape=None):
    """multinomial(trials, probs) or multinomial(trials, probs, [n, m, ...])
    returns array of multinomial distributed integer vectors.

    This is a thin wrapper that forwards to ``mt.multinomial(trials, probs,
    shape)``.

    trials is the number of trials in each multinomial distribution.
    probs is a one dimensional array of event probabilities; prob[i] is the
    probability of the i-th event.

    NOTE(review): the historical documentation of this function stated that
    there are len(prob)+1 events, the last one having probability
    1.-np.sum(prob), while also stating that the result has len(probs)
    entries per draw. Which of the two holds is decided by ``mt.multinomial``
    - confirm against its documentation.

    The first form returns a single 1-D array containing one multinomially
    distributed vector.

    The second form returns an array of shape (n, m, ..., len(probs)). In
    this case, output[i,j,...,:] is a 1-D array containing a multinomially
    distributed integer 1-D array.

    An empty list or tuple passed as ``shape`` is treated like an omitted
    shape."""
    # ``None`` replaces the former mutable ``[]`` default; an explicitly
    # passed empty list (the old default) or empty tuple still means "no
    # shape". Other values (ints, non-empty sequences, arrays) pass through.
    if isinstance(shape, (list, tuple)) and not shape:
        shape = None
    return mt.multinomial(trials, probs, shape)
def draw(x):
    """Draw multinomial counts with probabilities proportional to ``x``.

    ``x`` is normalised by its own sum to obtain the probability vector. The
    number of trials ``n`` is not a parameter; it is read from the
    surrounding scope. The counts are multiplied by ``1.`` so the result is
    floating point.
    """
    weights = x / float(x.sum())
    return 1. * multinomial(n, weights)
def _shift_counts(seq, start, N, m_word_length, alph_map, N_m, bg_m, delta):
    """Add ``delta`` to the count tables for the first ``N`` symbols of ``seq``.

    Symbols inside the window ``[start, start + m_word_length)`` are counted
    per word position in ``N_m``; all other symbols go to ``bg_m``.
    """
    end = start + m_word_length
    for x in range(N):
        symbol = alph_map[seq[x].upper()]
        if start <= x < end:
            N_m[symbol][x - start] += delta
        else:
            bg_m[symbol] += delta


def gibbssample(num_iters, pos_sequences, alphabet, m_word_length, m_word_param, background_param):
    """Gibbs-sample the start position of a hidden word in every sequence.

    In each iteration every sequence is visited once in random order: its
    symbols are removed from the count tables, a new start position is drawn
    from the conditional distribution given all other sequences, and the
    counts are added back for the new position. After each iteration the log
    posterior of the current configuration is recorded.

    :param num_iters: Number of sweeps over all sequences.
    :param pos_sequences: Two-element container; only ``pos_sequences[1]``
        (the sequences) is read. Symbols are upper-cased before lookup.
    :param alphabet: Indexable collection of symbols.
    :param m_word_length: Length of the hidden word.
    :param m_word_param: Per-symbol Dirichlet parameters for word positions.
    :param background_param: Per-symbol Dirichlet parameters for background.
    :returns: ``[R, iter_logpost]`` where ``R[i]`` is the final start position
        for sequence ``i`` and ``iter_logpost`` is
        ``[iteration indices, log posterior values]``.
    """
    sequences = pos_sequences[1]
    M = len(alphabet)      # Alphabet size
    K = len(sequences)     # Num sequences
    N = len(sequences[0])  # Seq length
    num_starts = N - m_word_length  # candidate start positions per sequence
    alph_map = {symbol: index for index, symbol in enumerate(alphabet)}
    iter_logpost = [[], []]

    # Initialize hidden word starting locations
    R = randint(0, num_starts, K).tolist()

    # Calculate sum of alphas for magic word and background distributions
    A = float(sum(m_word_param))
    A_back = float(sum(background_param))

    # Get magic word and background symbol counts
    N_m = [[0.0] * m_word_length for _ in range(M)]
    bg_m = [0.0] * M
    for i in range(K):
        _shift_counts(sequences[i], R[i], N, m_word_length, alph_map, N_m, bg_m, 1)

    # Begin iterations
    for l in range(num_iters):
        # A real list is needed because entries are deleted below; a
        # Python 3 ``range`` object does not support deletion.
        to_exclude = list(range(K))
        for _ in range(K):
            # Select sequence to exclude
            exclude_indx = randint(len(to_exclude))
            z = to_exclude[exclude_indx]
            del to_exclude[exclude_indx]
            seq_z = sequences[z]

            # Update counts for excluding excluded sequence
            _shift_counts(seq_z, R[z], N, m_word_length, alph_map, N_m, bg_m, -1)

            # Calculate log conditional P(s_(z,j) = m | s_(j,-z), alpha)
            # for each symbol and m_word pos
            P = [[0.0] * m_word_length for _ in range(M)]
            P_bg = [0.0] * M
            for m in range(M):
                P_bg[m] = log((bg_m[m] + background_param[m]) / (A_back + K * num_starts - 1))
                for j in range(m_word_length):
                    P[m][j] = log((N_m[m][j] + m_word_param[m]) / (A + K - 1))

            # Calculate posterior over each starting position in s_z:
            # start from "everything is background" and swap the background
            # term for the word term inside the candidate window.
            codes = [alph_map[symbol.upper()] for symbol in seq_z]
            r_bg_z = sum([P_bg[code] for code in codes])
            r = [r_bg_z] * num_starts
            for s_pos in range(num_starts):
                for j in range(m_word_length):
                    code = codes[s_pos + j]
                    r[s_pos] -= P_bg[code]
                    r[s_pos] += P[code][j]

            # Normalize conditionals. The largest log score is subtracted
            # first: for long sequences exp() of the raw scores underflows to
            # 0 and the normalizer would be zero.
            peak = max(r)
            probs = [exp(x - peak) for x in r]
            normalizer = sum(probs)
            probs = [x / normalizer for x in probs]

            # Update starting position for s_z
            R[z] = multinomial(1, probs).tolist().index(1)

            # Update counts for updating starting position of excluded sequence
            _shift_counts(seq_z, R[z], N, m_word_length, alph_map, N_m, bg_m, 1)

        # Calculate posterior
        log_post = lgamma(A_back) - lgamma(K * num_starts + A_back)
        for m in range(M):
            log_post += lgamma(bg_m[m] + background_param[m]) - lgamma(background_param[m])
        for j in range(m_word_length):
            log_post += lgamma(A) - lgamma(K + A)
            for m in range(M):
                log_post += lgamma(N_m[m][j] + m_word_param[m]) - lgamma(m_word_param[m])
        iter_logpost[0].append(l)
        iter_logpost[1].append(log_post)

    return [R, iter_logpost]
" ".join(["%.3g" % xx for xx in clusterPrior])) print(sys.stderr, "means:") for row in means: print(sys.stderr, " ".join(["%.3g" % xx for xx in row])) if vvar == 0: print(sys.stderr, "fixed variance %.3g" % sigma) else: print(sys.stderr, "variances:") for row in variances: print(sys.stderr, " ".join(["%.3g" % xx for xx in row])) print(sys.stderr, "noise variances:") print(sys.stderr, " ".join(["%.3g" % xx for xx in noiseVariances])) clusterSizes = multinomial(trainNum, clusterPrior) print (sys.stderr, "training cluster sizes:",\ " ".join([str(xx) for xx in clusterSizes])) for label in labels(trainNum, clusterSizes): print( label, " ".join([ str(xx) for xx in features(label, means, variances, noiseMeans, noiseVariances) ])) clusterSizes = multinomial(num, clusterPrior) print(clusterSizes) print(sys.stderr, "cluster sizes:",\ " ".join([str(xx) for xx in clusterSizes]))
" ".join(["%.3g" % xx for xx in clusterPrior])) print (sys.stderr, "means:") for row in means: print (sys.stderr, " ".join(["%.3g" % xx for xx in row])) if vvar == 0: print (sys.stderr, "fixed variance %.3g" % sigma) else: print (sys.stderr, "variances:") for row in variances: print (sys.stderr, " ".join(["%.3g" % xx for xx in row])) print (sys.stderr, "noise variances:") print (sys.stderr, " ".join(["%.3g" % xx for xx in noiseVariances])) clusterSizes = multinomial(trainNum, clusterPrior) print (sys.stderr, "training cluster sizes:",\ " ".join([str(xx) for xx in clusterSizes])) for label in labels(trainNum, clusterSizes): print (label, " ".join([str(xx) for xx in features(label, means, variances, noiseMeans, noiseVariances)])) clusterSizes = multinomial(num, clusterPrior) print(clusterSizes) print(sys.stderr, "cluster sizes:",\ " ".join([str(xx) for xx in clusterSizes])) for label in labels(num, clusterSizes):
# allProbs = np.exp(allProbs) return allProbs / np.sum(allProbs) # *** Gibbs sampling 200x # *** Gibbs sampling 200x for num_iter in range(200): print(num_iter) # update c logPi = np.log(pi) probs = x.map(lambda x_i: getProbs(False, log_mu, x_i, logPi)) # Now we need to asign and find out to which category goes each document c = probs.map(lambda prob: np.nonzero(multinomial(1, prob))[0][0] ) # *** c is the assignment of each doc to a category # update pi count = dict(c.map(lambda cat: (cat, 1)).reduceByKey(add).takeOrdered( 20)) # *** this is 'a' (vector of size 20) in the PDF # Now, we update the alpha new_alpha = [0] * 20 for i in range(20): if i in count: new_alpha[i] = alpha[i] + count[ i] # *** count[i], where i is the key else: new_alpha[i] = alpha[i]
def assignCategory(x, i, c):
    """Return ``c`` when ``x[1]`` equals ``i``, otherwise return ``x[0]``.

    ``x`` must be indexable with at least two entries; presumably a
    (category, index) pair -- TODO confirm against callers.
    """
    if x[1] == i:
        return c
    else:
        return x[0]


# Gibbs sampling: 200 passes. Each pass re-samples the category of every
# document and then updates the per-category counts.
for num_iter in range(200):
    print(num_iter)
    # update c
    logPi = np.log(pi)
    # The probability vectors are collected so that the random draw below
    # happens exactly once, locally.
    probs = x.map(lambda x_i: getProbs(False, log_mu, x_i, logPi)).collect()
    # print(len(probs)) # 19997
    # Now we need to assign and find out to which category each document goes:
    # a single-trial multinomial draw is one-hot, and the index of its
    # non-zero entry is the sampled category.
    c_local = [np.nonzero(multinomial(1, prob))[0][0] for prob in probs]
    # print(len(c_local)) # 19997
    # Re-attach the locally drawn categories to the elements of x by position.
    c = x.zipWithIndex().map(lambda tup: c_local[tup[1]])  #make it eager
    # update pi
    # Number of documents per category, as a {category: count} mapping.
    count = c.map(lambda cat: (cat, 1)).reduceByKey(add).sortByKey(
        ascending=True).collectAsMap()
    # Now, we update the alpha: prior value plus the observed count for each
    # of the 20 categories.
    new_alpha = [0] * 20
    for i in range(20):
        if i in count:
            new_alpha[i] = alpha[i] + count[i]
        else:
def __call__( self, nothing=None ) :
    """Return the empirical frequencies of a random-size multinomial draw.

    A number of trials is drawn with ``randint(1, self.Msamples)``, that many
    trials are distributed according to ``self.pvector``, and the resulting
    counts are divided by the number of trials.

    :param nothing: Unused; accepted so the object can be called with one
        argument.
    :returns: The counts divided by the number of trials.
    """
    draws = randint( 1, self.Msamples )
    counts = multinomial( draws, self.pvector )
    return counts / float(draws)