def generate_corpus(alpha, m, beta, n, D, Nd):
    """
    Returns a grouped corpus drawn from a mixture of Dirichlet--multinomial
    unigram language models.

    Arguments:

    alpha -- concentration parameter for the Dirichlet prior over theta
    m -- T-dimensional mean of the Dirichlet prior over theta
    beta -- concentration parameter for the Dirichlet prior over phis
    n -- V-dimensional mean of the Dirichlet prior over phis
    D -- number of documents to generate
    Nd -- number of tokens to generate per document
    """

    corpus = GroupedCorpus()

    theta = dirichlet(alpha * array(m), 1)
    phis = dirichlet(beta * array(n), len(m))

    for d in xrange(D):
        [t] = sample(theta, 1)
        corpus.add(str(d), str(t), [str(v) for v in sample(phis[t, :], Nd)])

    return corpus
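For reference, a minimal, self-contained sketch of the same generative process using plain NumPy (the GroupedCorpus and sample helpers above are not assumed; the function name and seed argument are illustrative):

import numpy as np

def sketch_mixture_corpus(alpha, m, beta, n, D, Nd, seed=0):
    rng = np.random.default_rng(seed)
    m, n = np.asarray(m, float), np.asarray(n, float)
    theta = rng.dirichlet(alpha * m)             # mixture weights over T components
    phis = rng.dirichlet(beta * n, size=len(m))  # one word distribution per component
    docs = []
    for d in range(D):
        t = rng.choice(len(m), p=theta)          # component assignment for document d
        words = rng.choice(len(n), size=Nd, p=phis[t])
        docs.append((t, words))
    return docs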
def init_params(K, L, M, N, X1, X2, no_obs, train_I, train_J):
    # TODO: initialize sigma so that, early on, r is neither evened out nor
    # neglected (the log-exp problem in the r update).
    alphas = [random(K,), random(L,)]
    alphas[0] = alphas[0] / np.sum(alphas[0])
    alphas[1] = alphas[1] / np.sum(alphas[1])
    gammas = [randint(low=50, high=500, size=(M, K)) + random((M, K)),
              randint(low=1.46, high=3, size=(N, L)) + random((N, L))]
    beta_shape = (K, L, 1 + X1.shape[1] + X2.shape[1])
    sigmaY_shape = (K, L)
    # randint(low = -1, high = 1, size = beta_shape) +
    betas = [random(beta_shape),
             randint(low=10, high=50, size=sigmaY_shape) + random(sigmaY_shape)]

    r1 = dirichlet(alphas[0], no_obs)
    r1[r1 < 1e-4] = 1e-4
    # r1[r1>0.99] = 0.9
    r2 = dirichlet(alphas[1], no_obs)
    r2[r2 < 1e-6] = 1e-6
    # r2[r2>0.9] = 0.9
    r = [r1, r2]

    ones = np.ones((len(train_I),))
    mu = sp.csr_matrix((ones, (train_I, train_J)), shape=(M, N)).sum(1)
    mv = sp.csr_matrix((ones, (train_I, train_J)), shape=(M, N)).sum(0)
    mu[mu < 1] = 1
    mv[mv < 1] = 1

    for k in range(K):
        gammas[0][:, k] = alphas[0][k] + np.array(np.divide(
            sp.csr_matrix((r1[:, k], (train_I, train_J)), shape=(M, N)).sum(1),
            mu).flatten())[0]  # M x K
    for l in range(L):
        gammas[1][:, l] = alphas[1][l] + np.array(np.divide(
            sp.csr_matrix((r2[:, l], (train_I, train_J)), shape=(M, N)).sum(0),
            mv).transpose().flatten())[0]  # N x L

    return alphas, gammas, betas, r
def generate_corpus(alpha, m, beta, n, D, Nd):
    """
    Returns a grouped corpus drawn from a mixture of Dirichlet--multinomial
    unigram language models.

    Arguments:

    alpha -- concentration parameter for the Dirichlet prior over theta
    m -- T-dimensional mean of the Dirichlet prior over theta
    beta -- concentration parameter for the Dirichlet prior over phis
    n -- V-dimensional mean of the Dirichlet prior over phis
    D -- number of documents to generate
    Nd -- number of tokens to generate per document
    """

    T = len(m)
    V = len(n)

    corpus = GroupedCorpus()

    theta = dirichlet(alpha * m)
    zVector = sample(theta, D)  # generate group type Zd for each document d
    allPhis = [dirichlet(beta * n) for t in xrange(T)]

    allWords = zeros(Nd * D)
    for d in xrange(D):
        for i in xrange(Nd):  # token index within document d
            allWords[d * Nd + i] = sample(allPhis[zVector[d]])

    print allWords
    return allWords, zVector
def iteration(V, D, N_DV, N_D, alpha, beta, M, phi_TV, z_D, inv_z_T,
              active_topics, inactive_topics, N_TV, N_T, D_T):
    """
    Performs a single iteration of Radford Neal's Algorithm 8.
    """

    for t in active_topics:
        phi_TV[t, :] = dirichlet(N_TV[t, :] + beta / V)

    for d in xrange(D):

        old_t = z_D[d]

        if inv_z_T is not None:
            inv_z_T[old_t].remove(d)

        N_TV[old_t, :] -= N_DV[d, :]
        N_T[old_t] -= N_D[d]
        D_T[old_t] -= 1

        seterr(divide='ignore')
        log_dist = log(D_T)
        seterr(divide='warn')

        idx = -1 * ones(M, dtype=int)
        idx[0] = old_t if D_T[old_t] == 0 else inactive_topics.pop()
        for m in xrange(1, M):
            idx[m] = inactive_topics.pop()
        active_topics |= set(idx)
        log_dist[idx] = log(alpha) - log(M)

        if idx[0] == old_t:
            phi_TV[idx[1:], :] = dirichlet(beta * ones(V) / V, M - 1)
        else:
            phi_TV[idx, :] = dirichlet(beta * ones(V) / V, M)

        for t in active_topics:
            log_dist[t] += (N_DV[d, :] * log(phi_TV[t, :])).sum()

        [t] = log_sample(log_dist)

        z_D[d] = t

        if inv_z_T is not None:
            inv_z_T[t].add(d)

        N_TV[t, :] += N_DV[d, :]
        N_T[t] += N_D[d]
        D_T[t] += 1

        idx = set(idx)
        idx.discard(t)

        active_topics -= idx
        inactive_topics |= idx
def make_artificial_data(M, N, D1, D2, K, L, no_obs):
    X1 = random((M, D1))
    X2 = random((N, D2))
    Xs = [X1, X2]

    beta = randint(low=0, high=10, size=(K, L, 1 + X1.shape[1] + X2.shape[1])) \
        + random((K, L, 1 + X1.shape[1] + X2.shape[1]))
    sigmaY = randint(low=0, high=10, size=(K, L)) + random((K, L))
    alphas = [randint(low=0, high=10000, size=(K,)) + random(K,),
              randint(low=0, high=1000, size=(L,)) + random(L,)]
    alpha1 = alphas[0]
    alpha2 = alphas[1]

    pi1 = dirichlet(alpha1, (M, 1))
    pi2 = dirichlet(alpha2, (N, 1))
    z1 = sample_discrete(pi1, (M, 1))
    z2 = sample_discrete(pi2, (N, 1))

    made_ij = False
    I = []
    J = []
    prev_len = 0
    while made_ij == False:
        I.extend(randint(low=0, high=M, size=(no_obs - prev_len,)))
        J.extend(randint(low=0, high=N, size=(no_obs - prev_len,)))
        W = sp.csr_matrix((np.ones(no_obs), (I, J)), shape=(M, N))
        I, J = sp.find(W)[:2]
        I = list(I)
        J = list(J)
        if len(I) == no_obs:
            made_ij = True
        else:
            prev_len = len(I)

    Xbias = np.ones((no_obs, 1))             # |Yobs| x 1
    Xusers = X1[I, :].reshape((no_obs, D1))  # |Yobs| x D1
    Xitems = X2[J, :].reshape((no_obs, D2))  # |Yobs| x D2
    X = np.hstack((Xbias, Xusers, Xitems))   # |Yobs| x (1 + D1 + D2)

    Y = np.zeros((no_obs,))                  # |Yobs| x 1
    for o in range(no_obs):
        Y[o] = sigmaY[int(z1[I[o]][0]), int(z2[J[o]][0])] * randn() \
            + np.dot(beta[int(z1[I[o]][0]), int(z2[J[o]][0]), :], X[o, :])

    pis = [pi1, pi2]
    zs = [z1, z2]
    betas = [beta, sigmaY]
    params = {'alphas': alphas, 'pis': pis, 'zs': zs, 'betas': betas}
    return Xs, Y, I, J, params
def init_params(K, L, M, N, X1, X2, no_obs, train_I, train_J):
    # TODO: initialize sigma so that, early on, r is neither evened out nor
    # neglected (the log-exp problem in the r update).
    alphas = [random(K), random(L)]
    alphas[0] = alphas[0] / np.sum(alphas[0])
    alphas[1] = alphas[1] / np.sum(alphas[1])
    gammas = [
        randint(low=50, high=500, size=(M, K)) + random((M, K)),
        randint(low=1.46, high=3, size=(N, L)) + random((N, L)),
    ]
    beta_shape = (K, L, 1 + X1.shape[1] + X2.shape[1])
    sigmaY_shape = (K, L)
    # randint(low = -1, high = 1, size = beta_shape) +
    betas = [random(beta_shape),
             randint(low=10, high=50, size=sigmaY_shape) + random(sigmaY_shape)]

    m1 = np.zeros((K, X1.shape[1])) + random((K, X1.shape[1]))
    m2 = np.zeros((L, X2.shape[1])) + random((L, X2.shape[1]))
    sigma1 = np.zeros((K, X1.shape[1])) + random((K, X1.shape[1]))
    sigma2 = np.zeros((L, X2.shape[1])) + random((L, X2.shape[1]))
    theta1 = [m1, sigma1]
    theta2 = [m2, sigma2]
    thetas = [theta1, theta2]

    r1 = dirichlet(alphas[0], M)
    r1[r1 < 1e-4] = 1e-4
    # r1[r1>0.99] = 0.9
    r2 = dirichlet(alphas[1], N)
    r2[r2 < 1e-6] = 1e-6
    # r2[r2>0.9] = 0.9
    r = [r1, r2]

    ones = np.ones((len(train_I),))
    mu = sp.csr_matrix((ones, (train_I, train_J)), shape=(M, N)).sum(1)
    mv = sp.csr_matrix((ones, (train_I, train_J)), shape=(M, N)).sum(0).transpose()
    mu[mu < 1] = 1
    mv[mv < 1] = 1

    s1 = dirichlet(alphas[0], M)
    s1[s1 < 1e-4] = 1e-4
    s2 = dirichlet(alphas[1], N)
    s2[s2 < 1e-6] = 1e-6
    s = [s1, s2]

    gammas[0] = np.tile(alphas[0].reshape(1, K), (M, 1)) + s[0] + np.multiply(r[0], mu)  # M x K
    gammas[1] = np.tile(alphas[1].reshape(1, L), (N, 1)) + s[1] + np.multiply(r[1], mv)  # N x L

    return alphas, gammas, betas, thetas, r, s
def generate_mmsbm_data(N, K, alpha, a, b, m=None):
    """
    N is the number of nodes
    K is the number of blocks
    alpha is the concentration parameter
    a and b are the shape parameters
    m is the base measure
    """

    if m is None:
        m = ones(K) / K  # uniform base measure

    Y = zeros((N, N), dtype=int)  # edges

    # sample node-specific distributions over blocks
    [theta] = dirichlet(alpha * m, (1, N))

    # sample between- and within-block edge probabilities
    phi = beta(a, b, (K, K))

    # sample block assignments and edges
    for i in range(1, N + 1):
        for j in range(1, N + 1):
            idx = (categorical(theta[i - 1, :]), categorical(theta[j - 1, :]))
            Y[i - 1, j - 1] = uniform() <= phi[idx]

    return theta, Y
def sample_dirichlet_from_dict(dt):
    '''
    Sample one Dirichlet-distributed vector for the given dictionary of alphas.
    '''
    alphas = dt.values()
    raw_dist = dirichlet(alphas)
    return dict(zip(dt.keys(), raw_dist))
def generate_sbm_data(N, K, alpha, a, b, m=None):
    """
    N is the number of nodes
    K is the number of blocks
    alpha is the concentration parameter
    a and b are the shape parameters
    m is the base measure
    """

    if m is None:
        m = ones(K) / K  # uniform base measure

    Z = zeros((N, K))  # block assignments

    # sample (global) distribution over blocks
    [theta] = dirichlet(alpha * m, 1)

    # sample between- and within-block edge probabilities
    phi = beta(a, b, (K, K))

    # sample block assignments
    for n in range(1, N + 1):
        Z[n - 1, :] = multinomial(1, theta)

    # sample edges
    Y = (uniform(size=(N, N)) <= dot(dot(Z, phi), Z.T)).astype(int)

    return Z, Y
def generate_docs(phi, ndocs, nwords_per_doc, alpha=0.1, p0=0.8):
    K, V = phi.shape
    theta = np.zeros((ndocs, K), dtype=float)

    switch = np.append([0], binomial(1, p0, ndocs - 1))
    switch = switch == 0
    samples = dirichlet([alpha] * K, size=int(switch.sum()))
    theta[switch] = samples

    last_theta = None
    for t in xrange(0, ndocs):
        if switch[t] == True:
            last_theta = theta[t]
            continue
        theta[t] = last_theta

    def gen_z(theta):
        z = np.repeat(np.arange(K), multinomial(nwords_per_doc, theta, size=1)[0])
        np.random.shuffle(z)
        return z

    z = np.apply_along_axis(gen_z, 1, theta)

    def gen_w(z):
        return np.random.multinomial(1, phi[z]).nonzero()[0][0]

    w = np.vectorize(gen_w)(z)

    return w, z, theta, switch
def theta_t(th, n, p):
    pt = pt_t(th, n, p)
    if binomial(1, pt) == 1:
        return (th, pt, np.log(pt))
    tt = dirichlet(alpha + n, 1)[0]
    return (tt, pt, np.log(1 - pt) + dir_logpdf(tt, alpha + n))
def generate_ratings(num_types, num_users, ratings_per_user=20, num_items=100,
                     alpha=None, noise=-1, plsi=False):
    p = Poisson(ratings_per_user)
    ratings = [[rint(1, 5) for i in range(num_items)] for i in range(num_types)]
    if alpha is None:
        alpha = [1] * num_types
    user_ratings = []
    user_indices = []
    type_dists = []
    for i in range(num_users):
        ratings_per_user = p.sample()
        if plsi:
            type_dist = normalize([rand() for t in range(num_types)])
        else:
            type_dist = dirichlet(alpha)
        type_dists.append(type_dist)
        rating = []
        indices = []
        for j in rsample(range(num_items), ratings_per_user):
            if rand() < noise:
                rating.append(rint(1, 5))
            else:
                type = sample(type_dist)
                rating.append(ratings[type][j])
            indices.append(j)
        user_ratings.append(rating)
        user_indices.append(indices)
    user_ratings = user_indices, user_ratings
    return user_ratings, ratings, type_dists
def inference(N_DV, alpha, beta, z_D, num_itns, true_z_D=None):
    """
    Nonconjugate split-merge.
    """

    M = 10  # number of auxiliary samples

    D, V = N_DV.shape

    T = D + M - 1  # maximum number of topics

    N_D = N_DV.sum(1)  # document lengths

    phi_TV = zeros((T, V))  # topic parameters

    inv_z_T = defaultdict(set)
    for d in xrange(D):
        inv_z_T[z_D[d]].add(d)  # inverse mapping from topics to documents

    active_topics = set(unique(z_D))
    inactive_topics = set(xrange(T)) - active_topics

    N_TV = zeros((T, V), dtype=int)
    N_T = zeros(T, dtype=int)
    for d in xrange(D):
        N_TV[z_D[d], :] += N_DV[d, :]
        N_T[z_D[d]] += N_D[d]

    D_T = bincount(z_D, minlength=T)

    # initialize topic parameters (necessary for Metropolis-Hastings only)
    for t in active_topics:
        phi_TV[t, :] = dirichlet(N_TV[t, :] + beta / V)

    for itn in xrange(num_itns):

        for _ in xrange(3):
            iteration(V, D, N_DV, N_D, alpha, beta, phi_TV, z_D, inv_z_T,
                      active_topics, inactive_topics, N_TV, N_T, D_T, 6)

        algorithm_8_iteration(V, D, N_DV, N_D, alpha, beta, M, phi_TV, z_D,
                              inv_z_T, active_topics, inactive_topics,
                              N_TV, N_T, D_T)

        if true_z_D is not None:
            v = vi(true_z_D, z_D)
            print "Itn. %d" % (itn + 1)
            print "%d topics" % len(active_topics)
            print "VI: %f bits (%f bits max.)" % (v, log2(D))
            if v < 1e-6:
                break

    return phi_TV, z_D
def get_fake_data(k, N):
    """
    k : number of categories and alpha parameters
    N : number of proportions in the training set
    """
    true_alphas = array([10, 5, 1, 1, 1, 1e-2, 1e-2, 1e-2, 1e-2, 1e-2])
    D = dirichlet(true_alphas, N)  # training set
    return D
def generateMarkovChain(sample_names):
    chain = {}
    for sample in sample_names:
        sample_probs = []
        chain[sample] = sample_probs
        random_distribution = dirichlet([1] * len(sample_names))
        for i, sample_child in enumerate(sample_names):
            song_prob = (sample_child, random_distribution[i])
            sample_probs.append(song_prob)
    return chain
def get_dists(dim, num):
    """
    Returns an array of discrete distributions.

    Arguments:

    dim -- dimensionality of the distributions
    num -- number of distributions to return
    """

    return dirichlet(ones(dim), num)
def generate(num_seq, seq_length, alphabet, m_word_length, m_word_param, background_param):
    magic_thetas = [dirichlet(m_word_param) for j in range(m_word_length)]
    background_theta = dirichlet(background_param)
    sequences = []
    starts = []
    for k in range(num_seq):
        background_onehots = [multinomial(1, background_theta)
                              for x in range(seq_length - m_word_length)]
        background = [alphabet[t] for t in [i.tolist().index(1) for i in background_onehots]]
        # background = [alphabet[t].lower() for t in [i.tolist().index(1) for i in background_onehots]]
        magic_onehots = [multinomial(1, theta) for theta in magic_thetas]
        magic_word = [alphabet[j] for j in [i.tolist().index(1) for i in magic_onehots]]
        start_pos = randint(seq_length - m_word_length)
        background[start_pos:start_pos] = magic_word
        sequences.append(background)
        starts.append(start_pos)
    # print starts
    ans = []
    ans.append(starts)
    ans.append(sequences)
    return ans
def randommodel(self, numstates, alphabet):
    I = array(dirichlet([1] * numstates))
    F = array([0.0] * numstates)
    S = []
    # F is treated as an end of string symbol
    for i in range(numstates):
        probs = dirichlet([1] * (alphabet + 1))
        newrow = array(probs[0:alphabet])
        self.normalize(newrow)
        S.append(newrow)
        F[i] = probs[alphabet]
    T = []
    for i in range(alphabet):
        T.append([])
        for j in range(numstates):
            newrow = array(dirichlet([1] * numstates))
            T[i].append(newrow)
    return (I, F, S, T)
def rand_init_param(self):
    logging.debug('Random param with seed: %s' % os.getpid())
    self.factors = [list() for _ in self.parts]
    # init cluster prior
    for p, prob in enumerate(dirichlet([HardEM.CLUSTER_PRIOR_ALPHA] * len(self.parts))):
        self.factors[p].append(ClusterPrior(prob))
    # init other singleton potential factors
    for p in self.parts:
        factors = self.factors[p]
        # factors.append(Binary_FC('isRealName', self.author_graph))
        # factors.append(Norm_FC('revLen', self.author_graph, (3, 7)))
        factors.append(ProdsFC('prProds', self.author_graph, self.author_product_map))
        factors.append(MembsFC('prMembs', self.author_graph))
def sample(self, m):
    """
    Draws m samples from the current Dirichlet distribution.

    :param m: Number of samples to draw.
    :type m: int.
    :returns: A Data object containing the samples
    :rtype: natter.DataModule.Data
    """
    return Data(dirichlet(tuple(self.param['alpha']), m).transpose(),
                str(m) + ' samples from ' + self.name)
def generate_corpus(beta, mean, N):
    """
    Returns a corpus of tokens drawn from a Dirichlet--multinomial unigram
    language model. Each token is an instance of one of V unique word types,
    represented by indices 0, ..., V - 1.

    Arguments:

    beta -- concentration parameter for the Dirichlet prior
    mean -- V-dimensional mean of the Dirichlet prior
    N -- number of tokens to generate
    """

    return sample(dirichlet(beta * array(mean), 1), N)
def comp_fractions(counts, method='dirichlet', **kwargs):
    """
    Convert counts to fractions using the given method.

    Parameters
    ----------
    method : string {dirichlet (default) | normalize | pseudo}
        dirichlet - randomly draw from the corresponding posterior Dirichlet
        distribution with a uniform prior. That is, for a vector of counts C,
        draw the fractions from Dirichlet(C+1).
        normalize - simply divide each row by its sum.
        pseudo - add given pseudo count (default 1) to each count and do
        simple normalization.

    KW Arguments
    ------------
    p_counts : int/float (default 1)
        The value of the pseudo counts to add to all counts.
        Used only if method is pseudo.

    Returns
    -------
    fracs: CompData
        Component fractions as a compositional data object.
    """
    from Compositions import CompData

    n, m = np.shape(counts)
    if method == 'dirichlet':
        from numpy.random.mtrand import dirichlet
        fracs = CompData(np.ones((n, m)))
        method = method.lower()
        for i in xrange(n):   # for each sample
            C = counts[i, :]  # counts of each otu in sample
            a = C + 1         # dirichlet parameters
            fracs[i, :] = dirichlet(a)
    elif method == 'normalize':
        temp = counts.T
        fracs = CompData((temp / temp.sum()).T)
    elif method == 'pseudo':
        p_counts = kwargs.pop('p_counts', 1)
        fracs = comp_fractions(counts + p_counts, method='normalize')
    else:
        raise ValueError('Unsupported method "%s"' % method)
    return fracs
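A minimal, self-contained sketch of the "dirichlet" option above with plain NumPy (no CompData dependency; the function name and seed argument are illustrative): each row of counts is replaced by one draw from the posterior Dirichlet(C + 1) under a uniform prior.

import numpy as np

def counts_to_fractions(counts, seed=None):
    rng = np.random.default_rng(seed)
    counts = np.asarray(counts, dtype=float)
    fracs = np.empty_like(counts)
    for i, C in enumerate(counts):
        fracs[i] = rng.dirichlet(C + 1)  # posterior draw with a uniform prior
    return fracs

# e.g. counts_to_fractions([[3, 0, 7], [1, 1, 1]]) returns two rows that each sum to 1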
def init_params(K, L, M, N, X1, X2, no_obs):
    # TODO: initialize sigma so that, early on, r is neither evened out nor
    # neglected (the log-exp problem in the r update).
    alphas = [random(K,), random(L,)]
    alphas[0] = alphas[0] / np.sum(alphas[0])
    alphas[1] = alphas[1] / np.sum(alphas[1])
    gammas = [randint(low=50, high=500, size=(M, K)) + random((M, K)),
              randint(low=1.46, high=3, size=(N, L)) + random((N, L))]
    beta_shape = (K, L, 1 + X1.shape[1] + X2.shape[1])
    sigmaY_shape = (K, L)
    # randint(low = -1, high = 1, size = beta_shape) +
    betas = [random(beta_shape),
             randint(low=10, high=50, size=sigmaY_shape) + random(sigmaY_shape)]

    r1 = dirichlet(alphas[0], M)
    r1[r1 < 1e-4] = 1e-4
    # r1[r1>0.99] = 0.9
    r2 = dirichlet(alphas[1], N)
    r2[r2 < 1e-6] = 1e-6
    # r2[r2>0.9] = 0.9
    r = [r1, r2]

    gammas[0] = np.tile(alphas[0].reshape(1, K), (M, 1)) + r[0]  # M x K
    gammas[1] = np.tile(alphas[1].reshape(1, L), (N, 1)) + r[1]  # N x L

    return alphas, gammas, betas, r
def copy(self, t, models, parents):
    """
    In the exemplar-based model, individuals copy the trait of other
    individuals on the basis of the extrinsic properties of those individuals.
    With probability C, individuals are biased towards copying from such
    individuals; the rest (1 - C) copy unbiased.
    """
    if t > 0:
        biased_parents = self.rnd.choice(self.N, size=self.N,
                                         p=dirichlet([self.alpha] * self.N))
        biased = self.rnd.rand(self.N) < self.C
        biased_parents[~biased] = parents[~biased]
        self.population = self.population[biased_parents]
        self.parents[t] = biased_parents
    else:
        self.population = models
        self.parents[t] = parents
def __init__(self, params):
    # The word distribution of this node's topic.
    self.word_dist = dirichlet(params["topic_to_word_param"])
    self.word_cdf = util.get_cdf(self.word_dist)

    # The number of documents that pass through this node.
    self.num_documents = 0

    # Those children of this node which have looked below this level.
    # Documents that reached this node but never looked below aren't
    # represented here; this is okay because the Chinese Restaurant
    # Process is exchangeable (doesn't depend on order).
    self.children = []

    # The number of documents which looked below this level. This
    # should always be equal to sum(c.num_documents for c in
    # self.children).
    self.num_documents_in_children = 0
def generate_data(V, D, l, alpha, beta):
    """
    Generates a synthetic corpus of documents from a Dirichlet process
    mixture model with multinomial mixture components (topics). The mixture
    components are drawn from a symmetric Dirichlet prior.

    Arguments:

    V -- vocabulary size
    D -- number of documents
    l -- average document length
    alpha -- concentration parameter for the Dirichlet process
    beta -- concentration parameter for the symmetric Dirichlet prior
    """

    T = D  # maximum number of topics

    phi_TV = zeros((T, V))
    z_D = zeros(D, dtype=int)
    N_DV = zeros((D, V), dtype=int)

    for d in xrange(D):

        # draw a topic assignment for this document
        dist = bincount(z_D).astype(float)
        dist[0] = alpha
        [t] = sample(dist)
        t = len(dist) if t == 0 else t
        z_D[d] = t

        # if it's a new topic, draw the parameters for that topic
        if t == len(dist):
            phi_TV[t - 1, :] = dirichlet(beta * ones(V) / V)

        # draw the tokens from the topic
        for v in sample(phi_TV[t - 1, :], num_samples=poisson(l)):
            N_DV[d, v] += 1

    z_D = z_D - 1

    return phi_TV, z_D, N_DV
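The per-document topic draw above follows a Chinese restaurant process. A minimal, self-contained sketch of just that step with plain NumPy (function name and seed are illustrative, not part of the code above):

import numpy as np

def crp_assignments(D, alpha, seed=None):
    """Sample topic assignments z_1, ..., z_D from a Chinese restaurant process."""
    rng = np.random.default_rng(seed)
    z = np.zeros(D, dtype=int)
    counts = []  # number of documents assigned to each existing topic
    for d in range(D):
        probs = np.array(counts + [alpha], dtype=float)  # existing topics, then a new one
        probs /= probs.sum()
        t = rng.choice(len(probs), p=probs)
        if t == len(counts):  # a new topic was opened
            counts.append(1)
        else:
            counts[t] += 1
        z[d] = t
    return z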
def E_step(w, phi, alpha, beta, p0, L, n=None, theta=None, maxiter=100,
           resampling=True, smoothing=False):
    T, N = w.shape
    K, V = phi.shape
    log_theta = None
    if n is None:
        n = dirichlet(alpha, size=T) * N
    if theta is None:
        theta, pt, log_theta = q_theta(n, alpha, p0, L)
    # if theta is None:
    #     theta = dirichlet(alpha, size=T)
    #     log_theta = np.log(theta)
    # if n is None:
    #     n = theta * N
    log_phi = np.log(phi)
    pt = None
    likelihood_log = np.zeros(maxiter, dtype=float)
    theta_log = np.zeros((maxiter, T, K), dtype=float)
    for iteration in xrange(maxiter):
        z = q_z(w, log_theta, phi)
        # z = q_z_alt(w, theta, phi)
        n = z.sum(axis=1)
        new_theta, pt, new_log_theta = q_theta(n, alpha, p0, L,
                                               resampling=resampling,
                                               smoothing=smoothing)
        # set_trace()
        diff = np.abs(theta - new_theta)
        avg_diff = diff.mean()
        max_diff = diff.max()
        likelihood_log[iteration] = likelihood(w, z, np.log(new_theta), log_phi)
        print 'iteration %d. avg diff: %f. max diff: %f. likelihood: %f' %\
            (iteration, avg_diff, max_diff, likelihood_log[iteration])
        theta_log[iteration] = theta
        log_theta = new_log_theta
        theta = new_theta
    return z, theta, pt, likelihood_log, theta_log
def computePrior(options):
    num = options.num
    clusterAlpha = options.clusterAlpha
    balanced = options.balanced
    numClusters = options.numClusters

    assert num > 0

    if numClusters is None:
        numClusters = max(int(log(num, 1.5)), 2)
    print(numClusters, "clusters", file=sys.stderr)

    clusterPrior = [clusterAlpha] * numClusters
    if not balanced:
        clusterPrior = dirichlet(clusterPrior)
    else:
        norm = sum(clusterPrior)
        clusterPrior = [xx / norm for xx in clusterPrior]

    return clusterPrior
def prob5(data, iters=5000):
    post = []
    post_temp = namedtuple('post', 'style, abv, post_prob')
    for label, vals in data.items():
        counts = Counter(vals)
        freq_data = [(i, j) for i, j in counts.items()]
        keys = [i[0] for i in freq_data]
        obs = [i[1] for i in freq_data]
        results = []
        for n in xrange(iters):
            samp = dirichlet([x for x in obs])
            results.append(samp)
        results = np.array(results)
        probs = np.mean(results, axis=0)
        n_data = zip(keys, probs)
        for key, prob in n_data:
            if prob is not None and key == 5:
                datum = post_temp(style=label[1], abv=label[0], post_prob=prob)
                post.append(datum)
    return post
def to_fractions(self, method='dirichlet', **kwargs):
    '''
    Convert counts to fractions, either by simple normalization, by adding
    pseudo counts, or by Dirichlet sampling.
    If Dirichlet sampling is used, fit a Dirichlet distribution for each
    sample (column) and draw the fractions from it.
    The prior is a uniform Dirichlet (a = ones(len(otus))).
    Return a new instance.
    '''
    if method == 'normalize':
        fracs = self.normalize()
    elif method == 'pseudo':
        p_counts = kwargs.get('p_counts', 1)
        fracs = (self + p_counts).normalize()
    else:
        from numpy.random.mtrand import dirichlet
        mat, row_labels, col_labels = self.to_matrix()
        for i in range(len(col_labels)):  # for each sample
            N = mat[:, i]     # counts of each otu in sample
            a = N + 1         # dirichlet parameters
            f = dirichlet(a)  # fractions are a random sample from the dirichlet
            mat[:, i] = f
        fracs = self.remove_rows(self.row_labels())
        fracs.from_matrix(mat, row_labels, col_labels)
    return fracs
def generate_data(T, K, beta, n=None):
    """
    T is the number of timesteps
    K is the dimensionality
    beta is the concentration parameter
    n is the base measure
    """

    if n is None:
        n = ones(K) / K  # uniform base measure

    x = zeros(T)  # observations
    r = zeros(T)  # run lengths
    C = []        # changepoints

    for t in range(1, T + 1):

        # sample run length
        if t == 1:
            r[t - 1] = 0
        else:
            if uniform() < hazard([r[t - 2] + 1]):
                r[t - 1] = 0
            else:
                r[t - 1] = r[t - 2] + 1

        # sample new parameters if run length is zero
        if r[t - 1] == 0:
            C.append(t)
            [phi] = dirichlet(beta * n, 1)

        x[t - 1] = categorical(phi)  # sample data

    return x, r, C
def rand_init(self):
    p = np.clip(dirichlet([1] * 2)[0], EPS, 1 - EPS)
    self.log_val = np.log(p)
    self.log_1_val = np.log(1 - p)
def rand_init(self):
    # near uniform initialization
    self.log_pr_prod = np.log(
        dirichlet([HardEM.PROD_PRIOR_ALPHA] * self.n_all_membs))
        count += 1
        idx = top20k.index(w)
        if idx in numwords:
            numwords[idx] += 1
        else:
            numwords[idx] = 1
    return (header, numwords, count)


result = lines.map(countWords)
result.cache()

alpha = [0.1] * 20
beta = np.array([0.1] * 20000)
pi = dirichlet(alpha).tolist()  # *** vector gives prevalence of each category
mu = np.array([dirichlet(beta) for j in range(20)])  # *** prob vector: prevalence of each word of a category in each doc
log_mu = np.log(mu)

header = result.map(lambda x: x[0]).collect()
x = result.map(lambda x: x[1]).map(map_to_array).cache()  # *** number of occurrences of each word in each doc

# getProbs accepts four parameters:
#
# checkParams: set to true if you want a check on all of the params
# that makes sure that everything looks OK. This will make the
# function run slower; use only for debugging
#
def sample_post(hp, ss):
    return dirichlet(ss.counts + hp.alphas)
def gibbs_sep_doc(alpha, Pi, A, word_doc, z_doc, doc_list,
                  z_count_all, p_md_all, E_theta_all, E_m_d_theta_all,
                  X, no_of_itr, vocabSize, proc_id):
    no_of_topics = A.shape[1]
    K = alpha.shape[0]  # number of mixture components
    multi_rand = np.random.multinomial(1, [1 / no_of_topics] * no_of_topics,
                                       size=(vocabSize))
    z_init = np.array([mat.argmax() for mat in multi_rand])
    theta = np.ones([no_of_topics]) * (1 / no_of_topics)
    numdocs = doc_list.shape[0]
    p_M = np.zeros([K])
    E_theta = np.zeros([no_of_topics])
    E_m_d_theta = np.zeros([K, no_of_topics])
    z_count = np.zeros([vocabSize, no_of_topics])

    # iterate through every document
    idx = 0
    for doc_index in doc_list:
        word_indices = word_doc[idx]
        z_d = z_doc[idx]
        E_theta.fill(0)
        E_m_d_theta.fill(0)
        p_M.fill(0)
        z_count.fill(0)

        for i in range(no_of_itr + X):
            # sample the mixture component
            p_theta_gm = np.empty(K)
            for k in range(K):
                p_theta_gm[k] = dirichlet_log_prob(theta, alpha[k])
            log_p_md = log_np_array(Pi) + p_theta_gm
            norm_log_pmd = (log_p_md - compute_log_sum(log_p_md))
            p_md = np.array([math.exp(m) for m in norm_log_pmd])
            p_md = p_md / p_md.sum()
            M = np.random.multinomial(1, p_md, size=1).argmax()

            # sample theta
            alpha_d = alpha[M]
            topics_count = count_topics(z_d, word_indices, no_of_topics)
            alpha_p = alpha_d + topics_count
            local_theta = dirichlet(alpha_p, size=1)
            local_theta = np.array([remove_zero(m) for m in local_theta[0]])
            theta = local_theta / local_theta.sum()

            # iterate through every word in the document to sample z
            w_count = 0
            for w_index in word_indices:
                p_zd = A[w_index] * theta
                p_zd = p_zd / p_zd.sum()
                word_topic = np.random.multinomial(1, p_zd, size=1).argmax()
                z_d[w_count] = word_topic
                w_count = w_count + 1
                if (i >= X):
                    z_count[w_index, word_topic] += 1

            # save theta and M samples from every iteration after burn-in
            if (i >= X):
                p_M[M] += 1
                E_theta += theta
                E_m_d_theta[M] += log(theta)

        idx += 1

        # communicate the arrays back to the parent process
        z_count_all[doc_index] = z_count
        p_md_all[doc_index] = p_M / no_of_itr
        E_theta_all[doc_index] = E_theta / no_of_itr
        E_m_d_theta_all[doc_index] = E_m_d_theta / no_of_itr
def iteration(V, D, N_DV, N_D, alpha, beta, phi_TV, z_D, inv_z_T,
              active_topics, inactive_topics, N_TV, N_T, D_T,
              num_inner_itns):
    """
    Performs a single iteration of Metropolis-Hastings (split-merge).
    """

    phi_s_V = empty(V)
    phi_t_V = empty(V)
    phi_merge_t_V = empty(V)

    N_s_V = empty(V, dtype=int)
    N_t_V = empty(V, dtype=int)
    N_merge_t_V = empty(V, dtype=int)

    log_dist = empty(2)

    d, e = choice(D, 2, replace=False)  # choose 2 documents

    if z_D[d] == z_D[e]:
        s = inactive_topics.pop()
        active_topics.add(s)
    else:
        s = z_D[d]

    inv_z_s = set([d])
    N_s_V[:] = N_DV[d, :]
    N_s = N_D[d]
    D_s = 1

    t = z_D[e]
    inv_z_t = set([e])
    N_t_V[:] = N_DV[e, :]
    N_t = N_D[e]
    D_t = 1

    inv_z_merge_t = set([d, e])
    N_merge_t_V[:] = N_DV[d, :] + N_DV[e, :]
    N_merge_t = N_D[d] + N_D[e]
    D_merge_t = 2

    if z_D[d] == z_D[e]:
        idx = inv_z_T[t] - set([d, e])
    else:
        idx = (inv_z_T[s] | inv_z_T[t]) - set([d, e])

    for f in idx:

        if uniform() < 0.5:
            inv_z_s.add(f)
            N_s_V += N_DV[f, :]
            N_s += N_D[f]
            D_s += 1
        else:
            inv_z_t.add(f)
            N_t_V += N_DV[f, :]
            N_t += N_D[f]
            D_t += 1

        inv_z_merge_t.add(f)
        N_merge_t_V += N_DV[f, :]
        N_merge_t += N_D[f]
        D_merge_t += 1

    if z_D[d] == z_D[e]:
        phi_merge_t_V[:] = phi_TV[t, :]
    else:
        phi_merge_t_V = dirichlet(N_merge_t_V + beta / V)

    acc = 0.0

    for inner_itn in xrange(num_inner_itns):

        # sample new parameters for topics s and t ... but if it's the
        # last iteration and we're doing a merge, then just set the
        # parameters back to phi_TV[s, :] and phi_TV[t, :]
        if inner_itn == num_inner_itns - 1 and z_D[d] != z_D[e]:
            phi_s_V[:] = phi_TV[s, :]
            phi_t_V[:] = phi_TV[t, :]
        else:
            phi_s_V = dirichlet(N_s_V + beta / V)
            phi_t_V = dirichlet(N_t_V + beta / V)

        if inner_itn == num_inner_itns - 1:
            acc += gammaln(N_s + beta)
            acc -= gammaln(N_s_V + beta / V).sum()
            acc += ((N_s_V + beta / V - 1) * log(phi_s_V)).sum()
            acc += gammaln(N_t + beta)
            acc -= gammaln(N_t_V + beta / V).sum()
            acc += ((N_t_V + beta / V - 1) * log(phi_t_V)).sum()
            acc -= gammaln(N_merge_t + beta)
            acc += gammaln(N_merge_t_V + beta / V).sum()
            acc -= ((N_merge_t_V + beta / V - 1) * log(phi_merge_t_V)).sum()

        for f in idx:  # (fake) restricted Gibbs sampling scan

            if f in inv_z_s:
                inv_z_s.remove(f)
                N_s_V -= N_DV[f, :]
                N_s -= N_D[f]
                D_s -= 1
            else:
                inv_z_t.remove(f)
                N_t_V -= N_DV[f, :]
                N_t -= N_D[f]
                D_t -= 1

            log_dist[0] = log(D_s)
            log_dist[0] += (N_DV[f, :] * log(phi_s_V)).sum()
            log_dist[1] = log(D_t)
            log_dist[1] += (N_DV[f, :] * log(phi_t_V)).sum()
            log_dist -= log_sum_exp(log_dist)

            if inner_itn == num_inner_itns - 1 and z_D[d] != z_D[e]:
                u = 0 if z_D[f] == s else 1
            else:
                [u] = log_sample(log_dist)

            if u == 0:
                inv_z_s.add(f)
                N_s_V += N_DV[f, :]
                N_s += N_D[f]
                D_s += 1
            else:
                inv_z_t.add(f)
                N_t_V += N_DV[f, :]
                N_t += N_D[f]
                D_t += 1

            if inner_itn == num_inner_itns - 1:
                acc += log_dist[u]

    if z_D[d] == z_D[e]:

        acc *= -1.0
        acc += log(alpha)
        acc += gammaln(D_s) + gammaln(D_t) - gammaln(D_T[t])

        tmp = beta / V
        acc += gammaln(beta) - V * gammaln(tmp)
        acc += (tmp - 1) * (log(phi_s_V).sum() + log(phi_t_V).sum())
        acc -= (tmp - 1) * log(phi_TV[t, :]).sum()

        acc += (N_s_V * log(phi_s_V)).sum() + (N_t_V * log(phi_t_V)).sum()
        acc -= (N_TV[t, :] * log(phi_TV[t, :])).sum()

        if log(uniform()) < min(0.0, acc):
            phi_TV[s, :] = phi_s_V
            phi_TV[t, :] = phi_t_V
            z_D[list(inv_z_s)] = s
            z_D[list(inv_z_t)] = t
            inv_z_T[s] = inv_z_s
            inv_z_T[t] = inv_z_t
            N_TV[s, :] = N_s_V
            N_TV[t, :] = N_t_V
            N_T[s] = N_s
            N_T[t] = N_t
            D_T[s] = D_s
            D_T[t] = D_t
        else:
            active_topics.remove(s)
            inactive_topics.add(s)

    else:

        acc -= log(alpha)
        acc += gammaln(D_merge_t) - gammaln(D_T[s]) - gammaln(D_T[t])

        tmp = beta / V
        acc += V * gammaln(tmp) - gammaln(beta)
        acc += (tmp - 1) * log(phi_merge_t_V).sum()
        acc -= (tmp - 1) * (log(phi_TV[s, :]).sum() + log(phi_TV[t, :]).sum())

        acc += (N_merge_t_V * log(phi_merge_t_V)).sum()
        acc -= ((N_TV[s, :] * log(phi_TV[s, :])).sum() +
                (N_TV[t, :] * log(phi_TV[t, :])).sum())

        if log(uniform()) < min(0.0, acc):
            phi_TV[s, :] = zeros(V)
            phi_TV[t, :] = phi_merge_t_V
            active_topics.remove(s)
            inactive_topics.add(s)
            z_D[list(inv_z_merge_t)] = t
            inv_z_T[s].clear()
            inv_z_T[t] = inv_z_merge_t
            N_TV[s, :] = zeros(V, dtype=int)
            N_TV[t, :] = N_merge_t_V
            N_T[s] = 0
            N_T[t] = N_merge_t
            D_T[s] = 0
            D_T[t] = D_merge_t
def generate_docs(num_topics, num_docs, words_per_doc=50, vocab_size=30,
                  alpha=0.001, beta=0.01, noise=-1, plsi=False):
    """Generates documents according to plsi or lda

    Args:
        num_topics: the number of underlying latent topics
        num_docs: the number of documents to generate
        words_per_doc: parameter to a Poisson distribution; determines the
            average number of words in a document
        vocab_size: the number of words in the vocabulary

        DIRICHLET PARAMETERS
        ---------------------
        Assumes symmetric dirichlet distributions (ie all elements in the
        parameter vector have the same value)
        ---------------------
        alpha: parameter to dirichlet distribution for topics
        beta: parameter to dirichlet distribution for words

        noise: given as a probability; each word will be replaced with a
            random word with noise probability
        plsi: flag to determine which distribution to draw from, a random
            distribution or a sample from a dirichlet distribution

    Returns:
        docs: the list of documents, each a list of words (represented by
            their indices in range(vocab_size))
        word_dist: the distribution over words for each topic; each row is
            the distribution for a different topic
        topics_dist: the distribution over topics for each document; each
            row is the distribution for a different document
    """
    p = Poisson(words_per_doc)
    alpha = [alpha] * num_topics
    beta = [beta] * vocab_size  # one prior entry per vocabulary word
    if plsi:
        word_dist = [normalize([rand() for w in range(vocab_size)])
                     for t in range(num_topics)]
    else:
        word_dist = [dirichlet(beta) for i in range(num_topics)]
    word_cdfs = []
    for topic in word_dist:
        word_cdfs.append(get_cdf(topic))
    topic_cdfs = []
    docs = []
    topic_dists = []
    doc_index = 0
    for i in range(num_docs):
        if doc_index % 100 == 0:
            print "reached document", doc_index
        words_per_doc = p.sample()
        doc = []
        if plsi:
            topic_dist = normalize([rand() for t in range(num_topics)])
        else:
            topic_dist = dirichlet(alpha)
        topic_dists.append(topic_dist)
        topic_cdf = get_cdf(topic_dist)
        topic_cdfs.append(topic_cdf)
        for word in range(words_per_doc):
            if rand() < noise:
                doc.append(rsample(range(vocab_size), 1))
            else:
                topic = sample(topic_cdf)
                doc.append(sample(word_cdfs[topic]))
        docs.append(doc)
        doc_index += 1
    return docs, word_dist, topic_dists
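A minimal, self-contained sketch of the LDA branch above using plain NumPy (no Poisson or get_cdf helpers; the function name, noise-free setup, and seed argument are illustrative):

import numpy as np

def lda_corpus(num_topics, num_docs, vocab_size, words_per_doc=50,
               alpha=0.1, beta=0.01, seed=None):
    rng = np.random.default_rng(seed)
    # one word distribution per topic, one topic distribution per document
    phi = rng.dirichlet([beta] * vocab_size, size=num_topics)
    theta = rng.dirichlet([alpha] * num_topics, size=num_docs)
    docs = []
    for d in range(num_docs):
        n = rng.poisson(words_per_doc)
        z = rng.choice(num_topics, size=n, p=theta[d])      # topic per token
        w = [rng.choice(vocab_size, p=phi[k]) for k in z]   # word per token
        docs.append(w)
    return docs, phi, theta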
def dir_fun(x):
    a = x + p_counts
    f = dirichlet(a)
    return f
def _sample_post(hp, ss):
    values = (hp.betas * hp.alpha).tolist()
    for i, count in ss.counts.iteritems():
        values[i] += count
    values.append(hp.beta0 * hp.alpha)
    return dirichlet(values)
        write_proto("%s-%i.index" % (filename, div), c)


if __name__ == "__main__":
    flags.InitFlags()

    beta = {}
    eta = zeros(flags.num_topics)
    vocab_total = defaultdict(int)
    for ii in xrange(flags.num_topics):
        eta[ii] = ii + random() * float(ii)
        for ll in xrange(flags.num_langs):
            print ml_vocab[ii]
            print ml_vocab[ii][ll]
            gamma = [flags.gamma] * len(ml_vocab[ii][ll])
            beta[(ll, ii)] = dirichlet(gamma)
            print "BETA", (ll, ii), beta[(ll, ii)]

    theta = {}
    alpha = [flags.alpha / float(flags.num_topics)] * flags.num_topics
    docs = defaultdict(Doc)

    print "Variance", flags.variance, flags.variance > 0
    for ll in xrange(flags.num_langs):
        for ii in [(ll, x) for x in xrange(flags.num_docs)]:
            z_bar = zeros(flags.num_topics)
            theta[ii] = DirichletDraw(alpha)
            docs[ii].lang = ll
            docs[ii].theta = theta[ii]
    i = 0
    while i < 20000:
        if i in mapping:
            count_lst[i] = mapping[i]
        i += 1
    return np.array(count_lst)


# calculate term frequency vector for each document
result = lines.map(countWords)
result.cache()

alpha = [0.1] * 20
beta = np.array([0.1] * 20000)
pi = dirichlet(alpha).tolist()
mu = np.array([dirichlet(beta) for j in range(20)])
log_mu = np.log(mu)

header = result.map(lambda x: x[0]).collect()
n = result.count()
l = result.map(lambda x: x[2]).collect()
x = result.map(lambda x: x[1]).map(map_to_array).cache()

label = result.map(lambda x: (x[0], x[1])).map(
    lambda x: ((x[0][x[0].index('id="') + 18:x[0].index('url')]), x[1])).map(
    lambda x: (x[0][:x[0].index("/")], map_to_array(x[1]))).cache()
labels = np.array(label.map(lambda x: x[0]).distinct().collect())


def getProbs(checkParams, log_allMus, x, log_pi):
            bag_of_word_num.append(globalWords[word])

    matrix = []
    for glosa in querys:
        tok = token(glosa)
        arrQuery = [0] * len(bag_of_word)
        for j in tok:
            arrQuery[bag_of_word.index(j[0])] = j[1]
        matrix.append(arrQuery)

    # random matrix
    Matrix_U = []
    for i in range(0, len(matrix)):
        Ui = dirichlet([1] * 3)
        Matrix_U.append(Ui)

    c = FuzzyCMeans(matrix, Matrix_U)
    c(kmax)

    inf = 0
    nav = 0
    res = 0
    categoryArray = []
    for val in c.mu:
        cat = min_val(val[0], val[1], val[2])
        categoryArray.append(cat)
        if cat == 'INF':
            inf += 1
        elif cat == 'NAV':
            nav += 1
def DirichletDraw(alpha):
    t = dirichlet(alpha)
    while isnan(t[0]) or isinf(t[0]):
        t = dirichlet(alpha)
    return t