Exemplo n.º 1
def generate_corpus(alpha, m, beta, n, D, Nd):
    """
    Returns a grouped corpus drawn from a mixture of
    Dirichlet--multinomial unigram language models.

    Arguments:

    alpha -- concentration parameter for the Dirichlet prior over theta
    m -- T-dimensional mean of the Dirichlet prior over theta
    beta -- concentration parameter for the Dirichlet prior over phis
    n -- V-dimensional mean of the Dirichlet prior over phis
    D -- number of documents to generate
    Nd -- number of tokens to generate per document
    """

    corpus = GroupedCorpus()

    # One distribution over the len(m) mixture components, and one
    # distribution over word types (a row of phis) per component.
    theta = dirichlet(alpha * array(m), 1)
    phis = dirichlet(beta * array(n), len(m))

    # range() instead of xrange() so the loop runs on Python 2 and 3.
    for d in range(D):
        # Pick the component for document d, then draw its Nd tokens
        # from that component's row of phis.
        [t] = sample(theta, 1)
        corpus.add(str(d), str(t), [str(v) for v in sample(phis[t, :], Nd)])

    return corpus
Exemplo n.º 2
Arquivo: misc.py Projeto: neerajg/sdap
def init_params(K, L, M, N, X1, X2, no_obs, train_I, train_J):
    """Randomly initialise alphas, gammas, betas and r.

    K, L -- lengths of the two alpha vectors (and column counts of
            gammas[0] / gammas[1])
    M, N -- shape of the sparse matrices built from train_I / train_J
    X1, X2 -- arrays of which only shape[1] is read, to size betas[0]
    no_obs -- number of Dirichlet draws that make up r1 and r2
    train_I, train_J -- row and column indices of the sparse entries

    Returns the tuple (alphas, gammas, betas, r).
    """
    # TO DO : need a way to make initialization of sigma in such a way that in the beginning not too much of r gets even out because of this or gets neglected
    # (update r log exp problem)
    alphas = [random(K,),random(L,)]
    # Normalise each alpha vector so that it sums to one.
    alphas[0] = alphas[0]/np.sum(alphas[0])
    alphas[1] = alphas[1]/np.sum(alphas[1])
    # NOTE(review): low = 1.46 is a non-integer bound for randint -- confirm it
    # is intended. Every column of both gammas arrays is overwritten by the two
    # loops at the end of this function.
    gammas = [randint(low = 50, high = 500, size = (M,K)) + random((M,K)), randint(low = 1.46, high = 3, size = (N,L)) + random((N,L))]
    beta_shape = (K,L,1 + X1.shape[1] + X2.shape[1])
    sigmaY_shape = (K,L)
    #randint(low = -1, high = 1, size = beta_shape) + 
    betas = [random(beta_shape), randint(low = 10, high = 50, size = sigmaY_shape) + random(sigmaY_shape)]  
    # r1 / r2: one Dirichlet draw per observation, with very small entries
    # raised to a fixed floor.
    r1 = dirichlet(alphas[0], no_obs)
    r1[r1<1e-4] = 1e-4
    #r1[r1>0.99] = 0.9
    r2 = dirichlet(alphas[1], no_obs)
    r2[r2<1e-6] = 1e-6
    #r2[r2>0.9] = 0.9    
    r = [r1,r2]
    # mu / mv: number of (train_I, train_J) entries per row / per column,
    # floored at 1 before being used as divisors below.
    ones = np.ones((len(train_I),))
    mu = sp.csr_matrix((ones, (train_I,train_J)), shape=(M,N)).sum(1)
    mv = sp.csr_matrix((ones, (train_I,train_J)), shape=(M,N)).sum(0)
    mu[mu<1] = 1
    mv[mv<1] = 1
    # gammas[0][:, k] = alpha_k + (row sum of r1[:, k] scattered at the
    # observed positions) / (entries per row).
    for k in range(K):
        gammas[0][:,k] = alphas[0][k] + np.array(np.divide(sp.csr_matrix((r1[:,k],(train_I,train_J)),shape=(M,N)).sum(1),mu).flatten())[0] # M x K
    # Same for the second side, using column sums and r2.
    for l in range(L):    
        gammas[1][:,l] = alphas[1][l] + np.array(np.divide(sp.csr_matrix((r2[:,l],(train_I,train_J)),shape=(M,N)).sum(0),mv).transpose().flatten())[0] # N x L
    return alphas, gammas, betas, r
Exemplo n.º 3
Exemplo n.º 4
def iteration(V, D, N_DV, N_D, alpha, beta, M, phi_TV, z_D, inv_z_T, active_topics, inactive_topics, N_TV, N_T, D_T):
    """
    Performs a single iteration of Radford Neal's Algorithm 8.

    Arguments:

    V -- vocabulary size (number of columns of N_DV and phi_TV)
    D -- number of documents
    N_DV -- per-document word counts
    N_D -- per-document lengths
    alpha -- concentration parameter weighting the auxiliary topics
    beta -- concentration parameter of the prior over topic parameters
    M -- number of auxiliary topics considered for each document
    phi_TV -- topic parameters (updated in place)
    z_D -- topic assignment of each document (updated in place)
    inv_z_T -- optional mapping from topic to the set of its documents;
               kept in sync with z_D when it is not None
    active_topics, inactive_topics -- sets of topic indices (updated in place)
    N_TV, N_T, D_T -- per-topic word counts, token counts and document
                      counts (updated in place)
    """

    for t in active_topics:
        phi_TV[t, :] = dirichlet(N_TV[t, :] + beta / V)

    for d in range(D):

        old_t = z_D[d]

        # Remove document d from its current topic.

        if inv_z_T is not None:
            inv_z_T[old_t].remove(d)

        N_TV[old_t, :] -= N_DV[d, :]
        N_T[old_t] -= N_D[d]
        D_T[old_t] -= 1

        # Topics with no documents get log(0) = -inf here, i.e. zero mass,
        # unless they are one of the M auxiliary topics chosen below.
        log_dist = log(D_T)

        # If d was the only document in old_t, that topic is reused as the
        # first auxiliary topic; all other auxiliary topics are taken from
        # the pool of inactive topics.
        idx = -1 * ones(M, dtype=int)
        idx[0] = old_t if D_T[old_t] == 0 else inactive_topics.pop()
        for m in range(1, M):
            idx[m] = inactive_topics.pop()
        active_topics |= set(idx)
        log_dist[idx] = log(alpha) - log(M)

        # Draw fresh parameters from the prior for the auxiliary topics,
        # keeping the existing parameters of old_t when it is reused.
        if idx[0] == old_t:
            phi_TV[idx[1:], :] = dirichlet(beta * ones(V) / V, M - 1)
        else:
            phi_TV[idx, :] = dirichlet(beta * ones(V) / V, M)

        for t in active_topics:
            log_dist[t] += (N_DV[d, :] * log(phi_TV[t, :])).sum()

        [t] = log_sample(log_dist)

        z_D[d] = t

        if inv_z_T is not None:
            inv_z_T[t].add(d)

        N_TV[t, :] += N_DV[d, :]
        N_T[t] += N_D[d]
        D_T[t] += 1

        # Return the unused auxiliary topics to the inactive pool. The
        # topic that was just chosen now holds document d, so it must stay
        # active even if it was one of the auxiliary topics.
        idx = set(idx)
        idx.discard(t)
        active_topics -= idx
        inactive_topics |= idx
Exemplo n.º 5
def make_artificial_data(M,N,D1,D2,K,L,no_obs):
    """Generate a random synthetic data set with no_obs observed cells.

    M, N -- number of rows / columns of the (M, N) observation grid
    D1, D2 -- number of features per row entity / per column entity
    K, L -- number of row clusters / column clusters
    no_obs -- number of distinct observed (row, column) cells

    Returns (Xs, Y, I, J, params): the two feature matrices, the observed
    values, the row and column indices of the observed cells, and a dict
    of the generating parameters.

    Raises ValueError if no_obs exceeds the number of cells M * N.
    """
    if no_obs > M * N:
        # More distinct cells were requested than exist; the sampling loop
        # below could never terminate.
        raise ValueError("no_obs cannot exceed the number of cells M * N")
    X1 = random((M,D1))
    X2 = random((N,D2))
    Xs = [X1,X2]
    n_coef = 1 + X1.shape[1] + X2.shape[1]
    beta = randint(low = 0, high = 10, size = (K,L,n_coef)) + random((K,L,n_coef))
    sigmaY = randint(low = 0, high = 10, size = (K,L)) + random((K,L))
    alphas = [randint(low = 0, high = 10000, size = (K,)) + random(K,), randint(low = 0, high = 1000, size = (L,)) + random(L,)]
    alpha1 = alphas[0]
    alpha2 = alphas[1]
    pi1 = dirichlet(alpha1,(M,1))
    pi2 = dirichlet(alpha2,(N,1))
    z1 = sample_discrete(pi1,(M,1))
    z2 = sample_discrete(pi2,(N,1))
    # Draw (row, column) pairs until there are exactly no_obs distinct ones:
    # each round tops the lists up to no_obs entries, and building the
    # sparse matrix merges duplicate pairs.
    made_ij = False
    I = []
    J = []
    prev_len = 0
    while not made_ij:
        I.extend(randint(low = 0,high = M, size = (no_obs - prev_len,)))
        J.extend(randint(low = 0,high = N, size = (no_obs - prev_len,)))
        W = sp.csr_matrix((np.ones(no_obs),(I,J)), shape=(M,N))
        I,J = sp.find(W)[:2]
        I = list(I)
        J = list(J)
        if len(I) == no_obs:
            made_ij = True
        else:
            # Remember how many distinct pairs survived so the next round
            # only draws the missing ones.
            prev_len = len(I)
    Xbias = np.ones((no_obs,1)) # |Yobs| x 1
    Xusers = X1[I,:].reshape((no_obs,D1)) # |Yobs| x D1
    Xitems = X2[J,:].reshape((no_obs,D2)) # |Yobs| x D2
    X = np.hstack((Xbias, Xusers, Xitems)) # |Yobs| x (1 + D1 + D2)
    Y = np.zeros((no_obs,)) # |Yobs| x 1
    for o in range(no_obs):
        # Linear response of the (row cluster, column cluster) pair plus
        # Gaussian noise scaled by that pair's sigmaY.
        k = int(z1[I[o]][0])
        l = int(z2[J[o]][0])
        Y[o] = sigmaY[k,l] * randn() + np.dot(beta[k,l,:],X[o,:])
    pis = [pi1,pi2]
    zs = [z1,z2]
    betas = [beta,sigmaY]
    # NOTE(review): only the 'alphas' entry was present in the reviewed copy
    # (the dict literal was cut off); the remaining keys are reconstructed
    # from the otherwise unused locals above -- confirm the key names.
    params = {'alphas':alphas, 'pis':pis, 'zs':zs, 'betas':betas}
    return Xs, Y, I, J, params
Exemplo n.º 6
Arquivo: misc.py Projeto: neerajg/sdap
def init_params(K, L, M, N, X1, X2, no_obs, train_I, train_J):
    """Randomly initialise alphas, gammas, betas, thetas, r and s.

    K, L -- lengths of the two alpha vectors (and column counts of
            gammas[0] / gammas[1])
    M, N -- number of rows of gammas[0] / gammas[1]; also the shape of the
            sparse indicator matrix built from train_I / train_J
    X1, X2 -- arrays of which only shape[1] is read, to size betas[0] and
              the thetas
    no_obs -- unused in this version
    train_I, train_J -- row and column indices of the observed cells

    Returns the tuple (alphas, gammas, betas, thetas, r, s).
    """
    # TO DO : need a way to make initialization of sigma in such a way that in the beginning not too much of r gets even out because of this or gets neglected
    # (update r log exp problem)
    alphas = [random(K), random(L)]
    alphas[0] = alphas[0] / np.sum(alphas[0])
    alphas[1] = alphas[1] / np.sum(alphas[1])
    # These draws are placeholders (both entries are overwritten below) and
    # are kept only so the sequence of random draws is unchanged. The lower
    # bound used to be the float 1.46, which randint truncates to 1; it is
    # now written as the integer it always meant.
    gammas = [
        randint(low=50, high=500, size=(M, K)) + random((M, K)),
        randint(low=1, high=3, size=(N, L)) + random((N, L)),
    ]
    beta_shape = (K, L, 1 + X1.shape[1] + X2.shape[1])
    sigmaY_shape = (K, L)
    betas = [random(beta_shape), randint(low=10, high=50, size=sigmaY_shape) + random(sigmaY_shape)]

    m1 = np.zeros((K, X1.shape[1])) + random((K, X1.shape[1]))
    m2 = np.zeros((L, X2.shape[1])) + random((L, X2.shape[1]))
    sigma1 = np.zeros((K, X1.shape[1])) + random((K, X1.shape[1]))
    sigma2 = np.zeros((L, X2.shape[1])) + random((L, X2.shape[1]))
    theta1 = [m1, sigma1]
    theta2 = [m2, sigma2]
    thetas = [theta1, theta2]

    # r: one Dirichlet draw per row entity / column entity, with very small
    # entries raised to a fixed floor.
    r1 = dirichlet(alphas[0], M)
    r1[r1 < 1e-4] = 1e-4
    r2 = dirichlet(alphas[1], N)
    r2[r2 < 1e-6] = 1e-6
    r = [r1, r2]
    # mu / mv: number of observed cells per row / per column as column
    # vectors, floored at 1.
    ones = np.ones((len(train_I),))
    mu = sp.csr_matrix((ones, (train_I, train_J)), shape=(M, N)).sum(1)
    mv = sp.csr_matrix((ones, (train_I, train_J)), shape=(M, N)).sum(0).transpose()
    mu[mu < 1] = 1
    mv[mv < 1] = 1

    # s: a second, independent set of draws built the same way as r.
    s1 = dirichlet(alphas[0], M)
    s1[s1 < 1e-4] = 1e-4
    s2 = dirichlet(alphas[1], N)
    s2[s2 < 1e-6] = 1e-6
    s = [s1, s2]

    gammas[0] = np.tile(alphas[0].reshape(1, K), (M, 1)) + s[0] + np.multiply(r[0], mu)  # M x K
    gammas[1] = np.tile(alphas[1].reshape(1, L), (N, 1)) + s[1] + np.multiply(r[1], mv)  # N x L

    return alphas, gammas, betas, thetas, r, s
Exemplo n.º 7
def generate_mmsbm_data(N, K, alpha, a, b, m=None):
    """
    Generates node-specific block distributions and a matrix of edges.

    N is the number of nodes
    K is the number of blocks
    alpha is the concentration parameter
    a and b are the shape parameters
    m is the base measure (uniform over the K blocks when omitted)

    Returns theta (one distribution over blocks per node) and Y (an N x N
    integer matrix of sampled edges).
    """

    # "is None" rather than "== None": comparing an array argument with ==
    # is element-wise and cannot be used as a condition.
    if m is None:
        m = ones(K) / K # uniform base measure

    Y = zeros((N, N), dtype=int) # edges

    # sample node-specific distributions over blocks

    [theta] = dirichlet(alpha * m, (1, N))

    # sample between- and within-block edge probabilities

    phi = beta(a, b, (K, K))

    # sample block assignments and edges

    for i in range(N):
        for j in range(N):
            # Each ordered pair draws its own block for i and for j, and
            # the edge is kept with the probability of that block pair.
            idx = (categorical(theta[i, :]), categorical(theta[j, :]))
            Y[i, j] = uniform() <= phi[idx]

    return theta, Y
Exemplo n.º 8
def sample_dirichlet_from_dict(dt):
    """
    Sample one probability vector from the Dirichlet distribution whose
    parameters are the values of the given dictionary.

    Returns a dict with the same keys as dt, mapping each key to its
    sampled probability. An empty dictionary yields an empty result.
    """
    if not dt:
        return {}
    # Materialise keys and values as lists in matching order; a dict view
    # cannot be converted to a parameter array on Python 3.
    keys = list(dt)
    alphas = [dt[key] for key in keys]
    raw_dist = dirichlet(alphas)
    return dict(zip(keys, raw_dist))
Exemplo n.º 9
def generate_sbm_data(N, K, alpha, a, b, m=None):
    """
    Generates block assignments and a matrix of edges.

    N is the number of nodes
    K is the number of blocks
    alpha is the concentration parameter
    a and b are the shape parameters
    m is the base measure (uniform over the K blocks when omitted)

    Returns Z (an N x K matrix with a single 1 per row marking each node's
    block) and Y (an N x N integer matrix of sampled edges).
    """

    # "is None" rather than "== None": comparing an array argument with ==
    # is element-wise and cannot be used as a condition.
    if m is None:
        m = ones(K) / K # uniform base measure

    Z = zeros((N, K)) # block assignments

    # sample (global) distribution over blocks

    [theta] = dirichlet(alpha * m, 1)

    # sample between- and within-block edge probabilities

    phi = beta(a, b, (K, K))

    # sample block assignments

    for node in range(N):
        Z[node, :] = multinomial(1, theta)

    # sample edges: Z * phi * Z' looks up, for every pair of nodes, the
    # edge probability of their pair of blocks

    Y = (uniform(size=(N, N)) <= dot(dot(Z, phi), Z.T)).astype(int)

    return Z, Y
Exemplo n.º 10
Arquivo: tm.py Projeto: sbos/twtm
def generate_docs(phi, ndocs, nwords_per_doc, alpha=0.1, p0=0.8):
    """Generate documents whose topic mixture changes only occasionally.

    phi -- K x V array; row k is the word distribution of topic k
    ndocs -- number of documents
    nwords_per_doc -- number of words in every document
    alpha -- symmetric Dirichlet parameter for a freshly drawn mixture
    p0 -- probability that a document keeps the previous document's mixture

    Returns (w, z, theta, switch): word indices and topic assignments (both
    ndocs x nwords_per_doc), the ndocs x K topic mixtures, and a boolean
    vector marking the documents that received a fresh mixture.
    """
    K, V = phi.shape

    theta = np.zeros((ndocs, K), dtype=float)

    # A Bernoulli(p0) success means "keep the previous mixture". The first
    # document is forced to 0 so that it always draws a fresh one.
    switch = np.append([0], binomial(1, p0, ndocs - 1))
    switch = switch == 0

    samples = dirichlet([alpha] * K, size=int(switch.sum()))
    theta[switch] = samples

    # Carry the most recent fresh mixture forward over the documents that
    # did not switch.
    last_theta = None
    for t in range(ndocs):
        if switch[t]:
            last_theta = theta[t]

        theta[t] = last_theta

    def gen_z(theta):
        # Draw how many words each topic gets, then expand the counts into
        # one topic index per word.
        z = np.repeat(np.arange(K),
            multinomial(nwords_per_doc, theta, size=1)[0])
        return z

    z = np.apply_along_axis(gen_z, 1, theta)

    def gen_w(z):
        # Index of the single word drawn from topic z's word distribution.
        return np.random.multinomial(1, phi[z]).nonzero()[0][0]

    w = np.vectorize(gen_w)(z)

    return w, z, theta, switch
Exemplo n.º 11
Arquivo: tm.py Projeto: sbos/twtm
    def theta_t(th, n, p):
        # Keep th with probability pt, otherwise draw a replacement from
        # Dirichlet(alpha + n). Returns (theta, pt, log-term), where the
        # log-term is log(pt) when th is kept and log(1 - pt) plus the
        # Dirichlet log-density of the replacement otherwise.
        pt = pt_t(th, n, p)
        kept = binomial(1, pt) == 1
        if not kept:
            tt = dirichlet(alpha + n, 1)[0]
            log_term = np.log(1-pt) + dir_logpdf(tt, alpha + n)
            return (tt, pt, log_term)

        return (th, pt, np.log(pt))
Exemplo n.º 12
def generate_ratings(num_types, num_users, ratings_per_user=20, num_items=100,
                     alpha=None, noise=-1, plsi=False):
    """Generate a table of per-type item ratings and sample items per user.

    Returns (user_ratings, ratings, type_dists), where user_ratings is the
    pair (user_indices, user_ratings) and ratings holds one list of
    num_items ratings for each of the num_types types.
    """
    # NOTE(review): this copy of the function looks incomplete -- nothing is
    # ever appended to user_ratings, user_indices or type_dists, so all three
    # are returned empty. Confirm against the upstream source.
    p = Poisson(ratings_per_user)
    # One rating per (type, item) pair; rint(1,5) presumably draws an integer
    # rating between 1 and 5 -- TODO confirm.
    ratings = [[rint(1,5) for i in range(num_items)] for i in range(num_types)]
    if alpha == None:
        alpha = [1]*num_types
    user_ratings = []
    user_indices = []
    type_dists = []
    for i in range(num_users):
        # Number of items this user rates, drawn from p.
        ratings_per_user = p.sample()
        if plsi:
            type_dist = normalize([rand() for t in range(num_types)])
            # NOTE(review): this assignment overwrites the one above, and
            # type_dist is never set when plsi is false; an `else:` line was
            # probably lost here.
            type_dist = dirichlet(alpha)
        rating = []
        indices = []
        for j in rsample(range(num_items), ratings_per_user):
            # NOTE(review): the sampled type is not used afterwards and
            # `rating` / `indices` are never filled; the rest of this loop
            # body appears to be missing.
            if rand() < noise:
                type = sample(type_dist)
    user_ratings = user_indices, user_ratings
    return user_ratings, ratings, type_dists
Exemplo n.º 13
# NOTE(review): this copy of inference() is damaged: the docstring line has
# lost its quotes, the two calls inside the main loop have lost their callee
# names and closing parentheses, and the final `if v < 1e-6:` has no body.
# The comments below describe only what is visible.
def inference(N_DV, alpha, beta, z_D, num_itns, true_z_D=None):
    Nonconjugate split-merge.

    M = 10  # number of auxiliary samples

    D, V = N_DV.shape

    T = D + M - 1  # maximum number of topics

    N_D = N_DV.sum(1)  # document lengths

    phi_TV = zeros((T, V))  # topic parameters

    inv_z_T = defaultdict(set)
    for d in xrange(D):
        inv_z_T[z_D[d]].add(d)  # inverse mapping from topics to documents

    # Topics that currently hold at least one document, and all the others.
    active_topics = set(unique(z_D))
    inactive_topics = set(xrange(T)) - active_topics

    # Per-topic word counts and token counts, accumulated from the
    # documents currently assigned to each topic.
    N_TV = zeros((T, V), dtype=int)
    N_T = zeros(T, dtype=int)

    for d in xrange(D):
        N_TV[z_D[d], :] += N_DV[d, :]
        N_T[z_D[d]] += N_D[d]

    # Number of documents per topic.
    D_T = bincount(z_D, minlength=T)

    # intialize topic parameters (necessary for Metropolis-Hastings only)

    for t in active_topics:
        phi_TV[t, :] = dirichlet(N_TV[t, :] + beta / V)

    for itn in xrange(num_itns):

        # NOTE(review): the next line is the argument list of a call whose
        # callee name is missing; it has no M argument and ends with the
        # literal 6.
        for _ in xrange(3):
                V, D, N_DV, N_D, alpha, beta, phi_TV, z_D, inv_z_T, active_topics, inactive_topics, N_TV, N_T, D_T, 6

            # NOTE(review): argument list of a second call, also missing its
            # callee name; this one passes M.
            V, D, N_DV, N_D, alpha, beta, M, phi_TV, z_D, inv_z_T, active_topics, inactive_topics, N_TV, N_T, D_T

        if true_z_D is not None:

            # vi presumably computes the variation of information between
            # the true and the current assignments -- TODO confirm.
            v = vi(true_z_D, z_D)

            print "Itn. %d" % (itn + 1)
            print "%d topics" % len(active_topics)
            print "VI: %f bits (%f bits max.)" % (v, log2(D))

            # NOTE(review): the body of this `if` is missing.
            if v < 1e-6:

    return phi_TV, z_D
Exemplo n.º 14
def get_fake_data(k, N):
    """
    Draws a training set of N proportion vectors from a fixed Dirichlet.

    k : number of categories and alpha parameters
    N : number of proportions in the training set

    Returns an array with one draw per row.

    NOTE(review): k is not used; the true alphas are a fixed 10-entry
    vector, so every draw has 10 categories regardless of k.
    """
    true_alphas = array([10, 5, 1, 1, 1, 1e-2, 1e-2, 1e-2, 1e-2, 1e-2])
    D = dirichlet(true_alphas, N)   # training set
    return D
Exemplo n.º 15
Arquivo: app.py Projeto: atran/parkov
def generateMarkovChain(sample_names):
  """Build a random transition table over the given sample names.

  Returns a dict mapping every name to a list of (next_name, probability)
  pairs, one pair per name, whose probabilities are a single draw from a
  flat Dirichlet and therefore sum to one.
  """
  chain = {}
  for sample in sample_names:
    sample_probs = []
    chain[sample] = sample_probs
    random_distribution = dirichlet([1] * len(sample_names))
    for i, sample_child in enumerate(sample_names):
      song_prob = (sample_child, random_distribution[i])
      # Store the pair; without this every entry of the chain stays empty.
      sample_probs.append(song_prob)
  return chain
Exemplo n.º 16
def get_dists(dim, num):
    """
    Returns an array of discrete distributions, one per row, each drawn
    from a Dirichlet with all parameters equal to one.

    Arguments:

    dim -- dimensionality of the distributions
    num -- number of distributions to return
    """

    return dirichlet(ones(dim), num)
Exemplo n.º 17
# NOTE(review): this copy looks incomplete -- `sequences` and `starts` are
# never filled and the function always returns an empty list. The lines that
# record each sequence and build `ans` appear to be missing.
def generate(num_seq, seq_length, alphabet, m_word_length, m_word_param, background_param):
	# One Dirichlet-drawn letter distribution per position of the "magic"
	# word, plus one shared distribution for the background letters.
	magic_thetas = [dirichlet(m_word_param) for j in range(m_word_length)]
	background_theta = dirichlet(background_param)
	sequences = []
	starts = []
	for k in range(num_seq):
		# Draw the background letters as one-hot vectors and map each to
		# its letter of the alphabet.
		background_onehots = [multinomial(1, background_theta) for x in range(seq_length - m_word_length)]
		background = [alphabet[t] for t in [i.tolist().index(1) for i in background_onehots]]
		#background = [alphabet[t].lower() for t in [i.tolist().index(1) for i in background_onehots]]
		# Draw the magic word, one letter per position-specific distribution.
		magic_onehots = [multinomial(1, theta) for theta in magic_thetas]
		magic_word = [alphabet[j] for j in [i.tolist().index(1) for i in magic_onehots]]
		# Insert the magic word into the background at a random position
		# (the empty slice makes this an insertion, not a replacement).
		start_pos = randint(seq_length - m_word_length)
		background[start_pos : start_pos] = magic_word
	#print starts
	ans = []
	return ans
 def randommodel(self, numstates, alphabet):
     # Draws random parameters for a model with `numstates` states over
     # `alphabet` symbols and returns the tuple (I, F, S, T).
     # NOTE(review): this copy looks incomplete -- `newrow` is computed in
     # both loops but never stored, so S and T are returned empty.
     # I: one Dirichlet draw over the states.
     I = array(dirichlet([1] * numstates))
     F = array([0.0] * numstates)
     S = []
     # F is treated as an end of string symbol
     for i in range(numstates):
         # One draw over alphabet + 1 outcomes: the first `alphabet`
         # entries form the row, the last entry goes to F[i].
         probs = dirichlet([1] * (alphabet + 1))
         newrow = array(probs[0:alphabet])
         F[i] = probs[alphabet]
     T = []
     for i in range(alphabet):
         for j in range(numstates):
             # One Dirichlet draw over the states per (symbol, state) pair.
             newrow = array(dirichlet([1] * numstates))
     return (I, F, S, T)
Exemplo n.º 19
 def rand_init_param(self):
     # Resets self.factors to one empty list per part and then fills each
     # list with the singleton potential factors below.
     logging.debug('Random param with seed: %s' % os.getpid())
     self.factors = [list() for _ in self.parts]
     # init cluster prior
     # NOTE(review): the body of this loop is missing from this copy; only
     # the header, which iterates over one Dirichlet draw with len(self.parts)
     # entries, is present.
     for p, prob in enumerate(dirichlet([HardEM.CLUSTER_PRIOR_ALPHA] * len(self.parts))):
     # init other singleton potential factors
     for p in self.parts:
         factors = self.factors[p]
         # factors.append(Binary_FC('isRealName', self.author_graph))
         # factors.append(Norm_FC('revLen', self.author_graph, (3, 7)))
         factors.append(ProdsFC('prProds', self.author_graph, self.author_product_map))
         factors.append(MembsFC('prMembs', self.author_graph))
Exemplo n.º 20
    def sample(self,m):
        """

        Samples m samples from the current Dirichlet distribution.

        :param m: Number of samples to draw.
        :type m: int.
        :returns:  A Data object containing the samples
        :rtype:    natter.DataModule.Data

        """
        # dirichlet returns one sample per row; the result is transposed so
        # that each sample becomes a column before it is wrapped in Data.
        samples = dirichlet(tuple(self.param['alpha']),m).transpose()
        return Data(samples,str(m) + ' samples from ' + self.name)
Exemplo n.º 21
def generate_corpus(beta, mean, N):
    """
    Returns a corpus of tokens drawn from a Dirichlet--multinomial
    unigram language model. Each token is an instance of one of V
    unique word types, represented by indices 0, ..., V - 1.

    Arguments:

    beta -- concentration parameter for the Dirichlet prior
    mean -- V-dimensional mean of the Dirichlet prior
    N -- number of tokens to generate
    """

    # Draw a single distribution over word types, then N tokens from it.
    phi = dirichlet(beta * array(mean), 1)
    return sample(phi, N)
Exemplo n.º 22
def comp_fractions(counts, method="dirichlet", **kwargs):
    """
    Convert counts to fractions using the given method.

    Parameters
    ----------
    counts : 2-D array of counts, one sample per row
    method : string {dirichlet (default) | normalize | pseudo}
        dirichlet - randomly draw from the corresponding posterior
                    Dirichlet distribution with a uniform prior.
                    That is, for a vector of counts C,
                    draw the fractions from Dirichlet(C+1).
        normalize - simply divide each row by its sum.
        pseudo    - add given pseudo count (default 1) to each count and
                    do simple normalization.
        The method name is matched case-insensitively.

    KW Arguments
    ------------
    p_counts : int/float (default 1)
        The value of the pseudo counts to add to all counts.
        Used only if method is pseudo.

    Returns
    -------
    fracs: CompData
        Component fractions as a compositional data object.

    Raises
    ------
    ValueError
        If method is not one of the supported names.
    """
    from Compositions import CompData

    # Normalise the name before dispatching so that every branch, not just
    # the first, is matched case-insensitively.
    method = method.lower()
    n, m = np.shape(counts)
    if method == "dirichlet":
        from numpy.random.mtrand import dirichlet

        fracs = CompData(np.ones((n, m)))
        for i in range(n):  # for each sample
            C = counts[i, :]  # counts of each otu in sample
            a = C + 1  # dirichlet parameters
            fracs[i, :] = dirichlet(a)
    elif method == "normalize":
        # NOTE(review): whether temp.sum() is a per-sample sum or a grand
        # total depends on the type of `counts`; confirm it matches the
        # "divide each row by its sum" contract above.
        temp = counts.T
        fracs = CompData((temp / temp.sum()).T)
    elif method == "pseudo":
        p_counts = kwargs.pop("p_counts", 1)
        fracs = comp_fractions(counts + p_counts, method="normalize")
    else:
        raise ValueError('Unsupported method "%s"' % method)
    return fracs
Exemplo n.º 23
Arquivo: misc.py Projeto: neerajg/sdap
Exemplo n.º 24
 def copy(self, t, models, parents):
     """
     In the exemplar based model, individuals copy the trait of other
     individuals on the basis of the extrinsic properties of those individuals.
     With a probability of C, individuals will be biased towards copying from
     such individuals; the rest (1 - C) copies unbiased.

     At t == 0 there is nobody to copy from, so the population is set to
     `models` and `parents` is recorded unchanged. For t > 0 the population
     is re-indexed by the chosen parents. In both cases the parents used
     are stored in self.parents[t].
     """
     if t > 0:
         # Candidate parents drawn with Dirichlet-distributed weights; the
         # individuals that are not biased fall back to the given parents.
         biased_parents = self.rnd.choice(self.N, size=self.N, p=dirichlet([self.alpha] * self.N))
         biased = self.rnd.rand(self.N) < self.C
         biased_parents[~biased] = parents[~biased]
         self.population = self.population[biased_parents]
         self.parents[t] = biased_parents
     else:
         self.population = models
         self.parents[t] = parents
Exemplo n.º 25
    def __init__(self, params):
        # Draw this node's topic (its distribution over words) from the
        # Dirichlet prior and precompute the matching CDF.
        word_prior = params["topic_to_word_param"]
        self.word_dist = dirichlet(word_prior)
        self.word_cdf = util.get_cdf(self.word_dist)

        # Document counters: how many documents pass through this node, and
        # how many of them looked below this level. The second should
        # always equal sum(c.num_documents for c in self.children).
        self.num_documents = 0
        self.num_documents_in_children = 0

        # Children that have looked below this level. Documents that
        # reached this node but never looked below are not represented
        # here, which is fine because the Chinese Restaurant Process is
        # exchangeable (it does not depend on order).
        self.children = []
Exemplo n.º 26
def generate_data(V, D, l, alpha, beta):
    """
    Generates a synthetic corpus of documents from a Dirichlet process
    mixture model with multinomial mixture components (topics). The
    mixture components are drawn from a symmetric Dirichlet prior.

    Arguments:

    V -- vocabulary size
    D -- number of documents
    l -- average document length
    alpha -- concentration parameter for the Dirichlet process
    beta -- concentration parameter for the symmetric Dirichlet prior

    Returns phi_TV (topic parameters), z_D (0-based topic assignment of
    each document) and N_DV (per-document word counts).
    """

    T = D # maximum number of topics

    phi_TV = zeros((T, V))
    z_D = zeros(D, dtype=int)
    N_DV = zeros((D, V), dtype=int)

    # Inside the loop topics are numbered from 1, so that 0 can mean "not
    # assigned yet"; the numbering is shifted back to 0-based at the end.

    for d in range(D):

        # draw a topic assignment for this document: slot 0 (which counts
        # the unassigned documents) is replaced by alpha, the weight of
        # opening a new topic

        dist = bincount(z_D).astype(float)
        dist[0] = alpha
        [t] = sample(dist)
        t = len(dist) if t == 0 else t
        z_D[d] = t

        # if it's a new topic, draw the parameters for that topic

        if t == len(dist):
            phi_TV[t - 1, :] = dirichlet(beta * ones(V) / V)

        # draw the tokens from the topic

        for v in sample(phi_TV[t - 1, :], num_samples=poisson(l)):
            N_DV[d, v] += 1

    z_D = z_D - 1

    return phi_TV, z_D, N_DV
Exemplo n.º 27
Arquivo: tm.py Projeto: sbos/twtm
def E_step(w, phi, alpha, beta, p0, L, n=None, theta=None, maxiter=100, resampling=True, smoothing=False):
    """Alternate q_z and q_theta updates for maxiter iterations.

    w -- T x N array (T and N are read from w.shape)
    phi -- K x V array of word distributions
    alpha, p0, L -- passed through to q_theta
    beta -- unused in this function
    n -- optional initial counts; drawn as Dirichlet(alpha) * N when None
    theta -- optional initial theta; computed with q_theta when None
    maxiter -- number of iterations
    resampling, smoothing -- passed through to q_theta inside the loop

    Returns (z, theta, pt, likelihood_log, theta_log), where likelihood_log
    and theta_log hold one entry per iteration. z and pt are None when
    maxiter is 0.
    """
    T, N = w.shape
    K, V = phi.shape

    log_theta = None

    # "is None" rather than "== None": comparing an array argument with ==
    # is element-wise and cannot be used as a condition.
    if n is None:
        n = dirichlet(alpha, size=T) * N
    if theta is None:
        theta, pt, log_theta = q_theta(n, alpha, p0, L)
    # NOTE(review): when theta is supplied by the caller, log_theta is still
    # None for the first q_z call below -- confirm q_z accepts that.

    log_phi = np.log(phi)

    z = None
    pt = None
    likelihood_log = np.zeros(maxiter, dtype=float)
    theta_log = np.zeros((maxiter, T, K), dtype=float)

    for iteration in range(maxiter):
        z = q_z(w, log_theta, phi)
        n = z.sum(axis=1)
        # NOTE(review): the end of this call was cut off in the reviewed
        # copy; smoothing=smoothing is reconstructed from the otherwise
        # unused `smoothing` parameter -- confirm against upstream.
        new_theta, pt, new_log_theta = q_theta(n, alpha, p0, L, resampling=resampling,
                                               smoothing=smoothing)

        diff = np.abs(theta - new_theta)
        avg_diff = diff.mean()
        max_diff = diff.max()

        likelihood_log[iteration] = likelihood(w, z, np.log(new_theta), log_phi)
        print('iteration %d. avg diff: %f. max diff: %f. likelihood: %f' %
              (iteration, avg_diff, max_diff, likelihood_log[iteration]))

        # The theta recorded for this iteration is the one it started from.
        theta_log[iteration] = theta
        log_theta = new_log_theta
        theta = new_theta

    return z, theta, pt, likelihood_log, theta_log
Exemplo n.º 28
def computePrior(options):
    """Build the prior over clusters from the given options.

    Reads options.num, options.clusterAlpha, options.balanced and
    options.numClusters. When numClusters is None it defaults to
    max(int(log(num, 1.5)), 2). The number of clusters is reported on
    standard error.

    Returns a list with one weight per cluster: clusterAlpha repeated when
    options.balanced is true, otherwise a normalised draw from a symmetric
    Dirichlet with parameter clusterAlpha.
    """
    num = options.num
    clusterAlpha = options.clusterAlpha
    balanced = options.balanced
    numClusters = options.numClusters

    assert (num > 0)

    if numClusters is None:
        numClusters = max(int(log(num, 1.5)), 2)

    # Write the diagnostic to stderr; passing sys.stderr as a positional
    # argument would print the stream object itself to stdout instead.
    print(numClusters, "clusters", file=sys.stderr)

    clusterPrior = [clusterAlpha] * numClusters
    if not balanced:
        clusterPrior = dirichlet(clusterPrior)
        norm = sum(clusterPrior)
        clusterPrior = [xx / norm for xx in clusterPrior]

    return clusterPrior
Exemplo n.º 29
Exemplo n.º 30
def prob5(data, iters=5000):
    """Estimate, for every label, the posterior probability of the value 5.

    data -- mapping from a label (indexable; label[0] is stored as abv and
            label[1] as style) to a list of observed values
    iters -- number of Dirichlet samples averaged per label

    For each label the observed value counts are used as Dirichlet
    parameters, and the mean of `iters` samples estimates the probability
    of each distinct value. Returns a list of post(style, abv, post_prob)
    named tuples, one for every label among whose values 5 occurs.
    """
    post = []
    post_temp = namedtuple('post', 'style, abv, post_prob')
    for label, vals in data.items():
        counts = Counter(vals)
        keys = list(counts)
        obs = [counts[key] for key in keys]
        # One row per sample; averaging over rows gives one probability per
        # distinct value. (Previously the samples were drawn but never
        # collected, so the mean was taken over nothing.)
        results = dirichlet(obs, iters)
        probs = np.mean(results, axis=0)
        for key, prob in zip(keys, probs):
            if prob is not None and key == 5:
                datum = post_temp(style=label[1], abv=label[0], post_prob=prob)
                # Keep the result; without this the function always
                # returned an empty list.
                post.append(datum)

    return post
Exemplo n.º 31
 def to_fractions(self, method='dirichlet', **kwargs):
     """
     Convert counts to fraction, either by simple normalization or adding pseudo counts, or dirichlet sampling.
     If dirichlet sampling is used, for each sample (col) fit a dirichlet distribution and sample the fraction from it.
     The prior is a uniform dirichlet (a = ones(len(otus)) )
     Return a new instance.

     method -- 'normalize', 'pseudo' or anything else for dirichlet sampling
     p_counts (keyword) -- pseudo count added to every count, default 1;
                           used only when method is 'pseudo'
     """
     # Compare strings with ==; `is` tests object identity and only works
     # by accident for strings that happen to be interned.
     if method == 'normalize':
         fracs = self.normalize()
     elif method == 'pseudo':
         p_counts = kwargs.get('p_counts', 1)
         fracs = (self + p_counts).normalize()
     else:
         from numpy.random.mtrand import dirichlet
         mat, row_labels, col_labels = self.to_matrix()
         # NOTE(review): if to_matrix() returns an integer array, assigning
         # the sampled fractions back into it truncates them -- confirm the
         # matrix dtype is float.
         for i in range(len(col_labels)):  # for each sample
             N = mat[:, i]  # counts of each otu in sample
             a = N + 1  # dirichlet parameters
             f = dirichlet(a)  # fractions are random sample from dirichlet
             mat[:, i] = f
         # Start from a copy of self with all rows removed, then load the
         # sampled fractions into it.
         fracs = self.remove_rows(self.row_labels())
         fracs.from_matrix(mat, row_labels, col_labels)
     return fracs
Exemplo n.º 32
def generate_data(T, K, beta, n=None):
    """
    Generates a sequence of categorical observations whose parameters are
    redrawn whenever the run length resets to zero.

    T is the number of timesteps
    K is the dimensionality
    beta is the concentration parameter
    n is the base measure (uniform over the K categories when omitted)

    Returns x (observations), r (run lengths) and C (changepoints).
    """

    # "is None" rather than "== None": comparing an array argument with ==
    # is element-wise and cannot be used as a condition.
    if n is None:
        n = ones(K) / K  # uniform base measure

    x = zeros(T)  # observations
    r = zeros(T)  # run lengths

    # NOTE(review): nothing visible in this function ever appends to C, so
    # it is returned empty -- confirm against upstream.
    C = []  # changepoints

    for t in range(1, T + 1):

        # sample run length: the first step always starts a run; after
        # that the run either resets (with the hazard probability for the
        # would-be run length) or grows by one

        if t == 1:
            r[t - 1] = 0
        else:
            if uniform() < hazard([r[t - 2] + 1]):
                r[t - 1] = 0
            else:
                r[t - 1] = r[t - 2] + 1

        # sample new parameters if run length is zero

        if r[t - 1] == 0:
            [phi] = dirichlet(beta * n, 1)

        x[t - 1] = categorical(phi)  # sample data

    return x, r, C
Exemplo n.º 34
# NOTE(review): this copy of generate_ratings is damaged: the signature below
# is cut off after its first parameter (the body also reads num_users,
# ratings_per_user, num_items, alpha, noise and plsi), and nothing is ever
# appended to user_ratings, user_indices or type_dists. Confirm against the
# upstream source.
def generate_ratings(num_types,
    p = Poisson(ratings_per_user)
    # One rating per (type, item) pair; rint(1, 5) presumably draws an
    # integer rating between 1 and 5 -- TODO confirm.
    ratings = [[rint(1, 5) for i in range(num_items)]
               for i in range(num_types)]
    if alpha == None:
        alpha = [1] * num_types
    user_ratings = []
    user_indices = []
    type_dists = []
    for i in range(num_users):
        # Number of items this user rates, drawn from p.
        ratings_per_user = p.sample()
        if plsi:
            type_dist = normalize([rand() for t in range(num_types)])
            # NOTE(review): this assignment overwrites the one above, and
            # type_dist is never set when plsi is false; an `else:` line was
            # probably lost here.
            type_dist = dirichlet(alpha)
        rating = []
        indices = []
        for j in rsample(range(num_items), ratings_per_user):
            if rand() < noise:
                # With probability `noise` a random rating is recorded.
                rating.append(rint(1, 5))
                # NOTE(review): the sampled type is not used afterwards; an
                # `else:` and the lines using it appear to be missing.
                type = sample(type_dist)
    user_ratings = user_indices, user_ratings

    return user_ratings, ratings, type_dists
Exemplo n.º 35
 def rand_init(self):
     # Draw a random probability (first entry of a flat two-way Dirichlet
     # draw), clip it away from 0 and 1, and store the logs of it and of
     # its complement.
     draw = dirichlet([1, 1])
     p = np.clip(draw[0], EPS, 1 - EPS)
     self.log_val = np.log(p)
     self.log_1_val = np.log(1 - p)
Exemplo n.º 36
 def rand_init(self):
     # Store the log of one draw from a symmetric Dirichlet with
     # self.n_all_membs entries, all equal to HardEM.PROD_PRIOR_ALPHA
     # (near uniform initialization).
     concentration = [HardEM.PROD_PRIOR_ALPHA] * self.n_all_membs
     draw = dirichlet(concentration)
     self.log_pr_prod = np.log(draw)
Exemplo n.º 37
                count += 1
                idx = top20k.index(w)
                if idx in numwords:
                    numwords[idx] += 1
                    numwords[idx] = 1
    return (header, numwords, count)

# Apply countWords to every element of `lines`; the code below reads field 0
# of each result as the header and field 1 as the word counts.
result = lines.map(countWords)

# Dirichlet parameters: 20 entries for the category prior and 20000 for the
# per-category word prior.
alpha = [0.1] * 20
beta = np.array([0.1] * 20000)

pi = dirichlet(alpha).tolist()  # one draw from Dirichlet(alpha): the prevalence of each category
mu = np.array([
    dirichlet(beta) for j in range(20)
])  # 20 independent draws from Dirichlet(beta): one word-probability vector per category
log_mu = np.log(mu)
header = result.map(lambda x: x[0]).collect()
x = result.map(lambda x: x[1]).map(
    map_to_array).cache()  # field 1 of each result passed through map_to_array; presumably the per-document word counts -- TODO confirm

# getProbs accepts four parameters:
# checkParams: set to true if you want a check on all of the params
#   that makes sure that everything looks OK. This will make the
#   function run slower; use only for debugging
Exemplo n.º 38
def sample_post(hp, ss):
    """Draw one sample from the Dirichlet with parameters ss.counts + hp.alphas."""
    posterior_alphas = ss.counts + hp.alphas
    return dirichlet(posterior_alphas)
Exemplo n.º 39
# NOTE(review): this copy of gibbs_sep_doc is damaged: the signature is cut
# off after its line continuation, and so is the np.random.multinomial call
# that builds multi_rand. The body also reads names that are not defined in
# the visible part (vocabSize, no_of_itr, X, z_count_all, p_md_all,
# E_theta_all, E_m_d_theta_all); they presumably come from the missing
# parameters -- confirm against the upstream source.
def gibbs_sep_doc(alpha,Pi,A,word_doc,z_doc,doc_list,\

    no_of_topics = A.shape[1] 
    K = alpha.shape[0] # No of mixture components
    # NOTE(review): under Python 2 integer division 1/no_of_topics is 0 unless
    # true division is enabled for the module -- confirm.
    multi_rand = np.random.multinomial(1,[1/no_of_topics]*no_of_topics,\
    z_init = np.array([mat.argmax() for mat in multi_rand])
    theta = np.ones([no_of_topics])*(1/no_of_topics)

    numdocs = doc_list.shape[0]
    # Accumulators; they are created once here and are not reset between
    # documents in the loop below.
    p_M = np.zeros([K])
    E_theta = np.zeros([no_of_topics])
    E_m_d_theta = np.zeros([K,no_of_topics])
    z_count = np.zeros([vocabSize,no_of_topics])
    # iterating through every document  
    idx = 0
    for doc_index in doc_list:
        word_indices = word_doc[idx]
        z_d = z_doc[idx]
        # The first X iterations only update the state; statistics are
        # accumulated from iteration X onwards (see the `i >= X` checks).
        for i in range(no_of_itr + X):    

            # Sampling the mixture component
            p_theta_gm = np.empty(K)
            for k in range(K):
                p_theta_gm[k] = dirichlet_log_prob(theta,alpha[k])
            # Combine log Pi with the per-component log-probabilities of
            # theta, normalise in log space, then exponentiate.
            log_p_md = log_np_array(Pi) + p_theta_gm
            norm_log_pmd = (log_p_md - compute_log_sum(log_p_md))
            p_md = np.array([math.exp(m) for m in norm_log_pmd])
            p_md = p_md/p_md.sum()
            M = np.random.multinomial(1,p_md,size=1).argmax()

            # Sampling theta
            alpha_d = alpha[M]
            topics_count = count_topics(z_d, word_indices, no_of_topics)
            alpha_p = alpha_d + topics_count
            local_theta = dirichlet(alpha_p, size=1)
            # remove_zero presumably replaces exact zeros -- TODO confirm;
            # the result is renormalised afterwards.
            local_theta = np.array([remove_zero(m) for m in local_theta[0]])
            theta = local_theta/local_theta.sum()
            w_count = 0
            for w_index in word_indices:
                # iterating through every word in document to sample z
                p_zd = A[w_index] * theta
                p_zd = p_zd/p_zd.sum()
                word_topic = np.random.multinomial(1,p_zd,size=1).argmax()
                z_d[w_count] = word_topic
                w_count = w_count + 1 

                if (i >= X):                  
                    z_count[w_index, word_topic] += 1 
            # Saving only theta and M samples from every iteration 
            # only burning-in
            if (i >= X):
                p_M[M] += 1 
                E_theta += theta 
                E_m_d_theta[M] += log(theta)

        idx += 1

        # Communicating the array's back to the parent process
        z_count_all[doc_index] = z_count
        p_md_all[doc_index] = p_M/no_of_itr
        E_theta_all[doc_index] = E_theta/no_of_itr
        E_m_d_theta_all[doc_index] = E_m_d_theta/no_of_itr
Exemplo n.º 40
def iteration(V, D, N_DV, N_D, alpha, beta, phi_TV, z_D, inv_z_T, active_topics, inactive_topics, N_TV, N_T, D_T, num_inner_itns):
    # NOTE(review): this listing appears to have lost lines (docstring
    # quotes, several `else:` branches, at least one continuation line).
    # The comments below flag each suspicious spot; confirm against the
    # upstream source before relying on this function.
    #
    # The line below reads like the body of a docstring whose delimiters
    # are missing.
    Performs a single iteration of Metropolis-Hastings (split-merge).

    # Length-V scratch arrays for the two proposed topics (s, t) and for
    # the merged topic: word distributions first, then integer word counts.
    phi_s_V = empty(V)
    phi_t_V = empty(V)
    phi_merge_t_V = empty(V)

    N_s_V = empty(V, dtype=int)
    N_t_V = empty(V, dtype=int)
    N_merge_t_V = empty(V, dtype=int)

    # Two-slot buffer holding the log-weights of assigning a document to
    # s (slot 0) or t (slot 1).
    log_dist = empty(2)

    d, e = choice(D, 2, replace=False) # choose 2 documents

    # NOTE(review): as written, the topic popped from inactive_topics is
    # immediately overwritten by z_D[d]; an `else:` before the second
    # assignment looks to be missing.
    if z_D[d] == z_D[e]:
        s = inactive_topics.pop()
        s = z_D[d]

    # Proposed topic s starts out holding only document d.
    inv_z_s = set([d])
    N_s_V[:] = N_DV[d, :]
    N_s = N_D[d]
    D_s = 1

    # Proposed topic t starts out holding only document e.
    t = z_D[e]
    inv_z_t = set([e])
    N_t_V[:] = N_DV[e, :]
    N_t = N_D[e]
    D_t = 1

    # The merged topic starts out holding both anchor documents.
    inv_z_merge_t = set([d, e])
    N_merge_t_V[:] = N_DV[d, :] + N_DV[e, :]
    N_merge_t = N_D[d] + N_D[e]
    D_merge_t = 2

    # idx = the other documents affected by the move (anchors d, e removed).
    # NOTE(review): the second assignment overwrites the first as written;
    # an `else:` looks to be missing here as well.
    if z_D[d] == z_D[e]:
        idx = inv_z_T[t] - set([d, e])
        idx = (inv_z_T[s] | inv_z_T[t]) - set([d, e])

    # Random initial allocation of the remaining documents.
    # NOTE(review): as written, a document passing the coin flip is added
    # to BOTH s and t; presumably the t-updates belong to a lost `else:`.
    # Also, no visible line ever adds f to inv_z_s / inv_z_t /
    # inv_z_merge_t, although later code tests `f in inv_z_s` -- possibly
    # more lost lines.
    for f in idx:
        if uniform() < 0.5:
            N_s_V += N_DV[f, :]
            N_s += N_D[f]
            D_s += 1
            N_t_V += N_DV[f, :]
            N_t += N_D[f]
            D_t += 1

        # Every remaining document is counted in the merged topic.
        N_merge_t_V += N_DV[f, :]
        N_merge_t += N_D[f]
        D_merge_t += 1

    # NOTE(review): the copy from phi_TV[t, :] is immediately replaced by a
    # fresh Dirichlet draw as written; an `else:` is presumably missing.
    if z_D[d] == z_D[e]:
        phi_merge_t_V[:] = phi_TV[t, :]
        phi_merge_t_V = dirichlet(N_merge_t_V + beta / V)

    # Running log acceptance term, filled in on the last inner iteration.
    acc = 0.0

    for inner_itn in xrange(num_inner_itns):

        # sample new parameters for topics s and t ... but if it's the
        # last iteration and we're doing a merge, then just set the
        # parameters back to phi_TV[s, :] and phi_TV[t, :]

        # NOTE(review): the two dirichlet() draws below sit inside this
        # `if` as written, which contradicts the comment above; an `else:`
        # before them looks to be missing.
        if inner_itn == num_inner_itns - 1 and z_D[d] != z_D[e]:
            phi_s_V[:] = phi_TV[s, :]
            phi_t_V[:] = phi_TV[t, :]
            phi_s_V = dirichlet(N_s_V + beta / V)
            phi_t_V = dirichlet(N_t_V + beta / V)

        # On the final inner iteration, accumulate gammaln / log(phi) terms
        # for s and t (added) and for the merged topic (subtracted).
        if inner_itn == num_inner_itns - 1:

            acc += gammaln(N_s + beta)
            acc -= gammaln(N_s_V + beta / V).sum()
            acc += ((N_s_V + beta / V - 1) * log(phi_s_V)).sum()

            acc += gammaln(N_t + beta)
            acc -= gammaln(N_t_V + beta / V).sum()
            acc += ((N_t_V + beta / V - 1) * log(phi_t_V)).sum()

            acc -= gammaln(N_merge_t + beta)
            acc += gammaln(N_merge_t_V + beta / V).sum()
            # NOTE(review): the expression on the next line is never
            # closed; its continuation line appears to have been lost.
            acc -= ((N_merge_t_V + beta / V - 1) *

        for f in idx:

            # (fake) restricted Gibbs sampling scan

            # Remove document f from its current proposed topic.
            # NOTE(review): as written, f is removed from both s and t when
            # it is in inv_z_s; an `else:` is presumably missing.
            if f in inv_z_s:
                N_s_V -= N_DV[f, :]
                N_s -= N_D[f]
                D_s -= 1
                N_t_V -= N_DV[f, :]
                N_t -= N_D[f]
                D_t -= 1

            # Unnormalised log-weight of each topic for document f:
            # log(document count) + sum of word counts * log(phi).
            log_dist[0] = log(D_s)
            log_dist[0] += (N_DV[f, :] * log(phi_s_V)).sum()

            log_dist[1] = log(D_t)
            log_dist[1] += (N_DV[f, :] * log(phi_t_V)).sum()

            # Normalise in log space.
            log_dist -= log_sum_exp(log_dist)

            # NOTE(review): the forced choice of u is overwritten by the
            # sampled one as written, and u is unbound when the condition
            # is false; an `else:` before log_sample() looks to be missing.
            if inner_itn == num_inner_itns - 1 and z_D[d] != z_D[e]:
                u = 0 if z_D[f] == s else 1
                [u] = log_sample(log_dist)

            # Put document f back into the chosen topic.
            # NOTE(review): both s and t are incremented under `u == 0` as
            # written; presumably the t-updates belong to a lost `else:`.
            if u == 0:
                N_s_V += N_DV[f, :]
                N_s += N_D[f]
                D_s += 1
                N_t_V += N_DV[f, :]
                N_t += N_D[f]
                D_t += 1

            # The log-probability of the final assignment feeds into acc.
            if inner_itn == num_inner_itns - 1:
                acc += log_dist[u]

    # d and e currently share a topic -- presumably the split proposal.
    if z_D[d] == z_D[e]:

        acc *= -1.0

        acc += log(alpha)
        acc += gammaln(D_s) + gammaln(D_t) - gammaln(D_T[t])
        tmp = beta / V
        acc += gammaln(beta) - V * gammaln(tmp)
        acc += (tmp - 1) * (log(phi_s_V).sum() + log(phi_t_V).sum())
        acc -= (tmp - 1) * log(phi_TV[t, :]).sum()

        acc += (N_s_V * log(phi_s_V)).sum() + (N_t_V * log(phi_t_V)).sum()
        acc -= (N_TV[t, :] * log(phi_TV[t, :])).sum()

        # Accept test in log space; on acceptance, write the proposed s/t
        # state back into the shared arrays in place.
        if log(uniform()) < min(0.0, acc):
            phi_TV[s, :] = phi_s_V
            phi_TV[t, :] = phi_t_V
            z_D[list(inv_z_s)] = s
            z_D[list(inv_z_t)] = t
            inv_z_T[s] = inv_z_s
            inv_z_T[t] = inv_z_t
            N_TV[s, :] = N_s_V
            N_TV[t, :] = N_t_V
            N_T[s] = N_s
            N_T[t] = N_t
            D_T[s] = D_s
            D_T[t] = D_t


        # NOTE(review): the statements below are indented as part of the
        # same-topic branch, but they read like the merge counterpart
        # (they subtract log(alpha) and write the merged state); an outer
        # `else:` has presumably been lost. No visible line updates
        # active_topics / inactive_topics either.
        acc -= log(alpha)
        acc += gammaln(D_merge_t) - gammaln(D_T[s]) - gammaln(D_T[t])
        tmp = beta / V
        acc += V * gammaln(tmp) - gammaln(beta)
        acc += (tmp - 1) * log(phi_merge_t_V).sum()
        acc -= (tmp - 1) * (log(phi_TV[s, :]).sum() + log(phi_TV[t, :]).sum())

        acc += (N_merge_t_V * log(phi_merge_t_V)).sum()
        acc -= ((N_TV[s, :] * log(phi_TV[s, :])).sum() +
                (N_TV[t, :] * log(phi_TV[t, :])).sum())

        # Accept test in log space; on acceptance, topic s is emptied and
        # topic t receives the merged state.
        if log(uniform()) < min(0.0, acc):
            phi_TV[s, :] = zeros(V)
            phi_TV[t, :] = phi_merge_t_V
            z_D[list(inv_z_merge_t)] = t
            inv_z_T[t] = inv_z_merge_t
            N_TV[s, :] = zeros(V, dtype=int)
            N_TV[t, :] = N_merge_t_V
            N_T[s] = 0
            N_T[t] = N_merge_t
            D_T[s] = 0
            D_T[t] = D_merge_t
Exemplo n.º 41
# NOTE(review): the signature below is cut off after `num_topics,` in this
# listing. The body also reads num_docs, words_per_doc, vocab_size, alpha,
# beta, noise and plsi, so those are presumably the remaining parameters;
# their order and defaults must be confirmed against the upstream source.
def generate_docs(num_topics,
    """Generates documents according to plsi or lda

    NOTE(review): the parameter labels of this docstring were missing from
    the listing; the labels below were matched by position to the names the
    body uses and should be confirmed.

    Args:
        num_topics:
            the number of underlying latent topics
        num_docs:
            the number of documents to generate
        words_per_doc:
            parameter to a Poisson distribution;
            determines the average words in a documents
        vocab_size:
            the number of words in the vocabulary

        Assumes symmetric dirichlet distributions (ie all elements in the
        parameter vector have the same value)

        alpha:
            parameter to dirichlet distribution for topics
        beta:
            parameter to dirichlet distribution for words
        noise:
            given as a probability; each word will be replaced with a random
            word with noise probability
        plsi:
            flag to determine which distribution to draw from,
            a random distribution or a sample from a dirichlet distribution

    Returns:
        docs:
            the list of documents, each a list of words (represented by their
            indices in range(vocab_size)
        word_dist:
            the distribution over words for each topic;
            each row is the distribution for a different topic
        topic_dists:
            the distribution over topics for each document;
            each row is the distribution for a different document
    """
    # Sampler for the per-document word count.
    p = Poisson(words_per_doc)

    # Expand the scalar hyper-parameters into symmetric parameter vectors.
    # NOTE(review): beta is replicated num_topics times, so dirichlet(beta)
    # yields num_topics entries, whereas the plsi branch builds rows of
    # length vocab_size -- confirm which length is intended.
    alpha = [alpha] * num_topics
    beta = [beta] * num_topics

    # NOTE(review): the list opened below is never closed and the dirichlet
    # assignment follows directly; a closing `]` and an `else:` appear to
    # have been lost.
    if plsi:
        word_dist = [
            normalize([rand() for w in range(vocab_size)])
            for t in range(num_topics)
        word_dist = [dirichlet(beta) for i in range(num_topics)]
    word_cdfs = []
    # NOTE(review): this loop has no body in the listing; presumably it
    # filled word_cdfs.
    for topic in word_dist:

    topic_cdfs = []
    docs = []
    topic_dists = []
    doc_index = 0
    for i in range(num_docs):
        # Progress message every 100 documents.
        if doc_index % 100 == 0:
            print "reached document", doc_index
        # Shadows the parameter with the sampled length of this document.
        words_per_doc = p.sample()
        doc = []
        # NOTE(review): the dirichlet draw overwrites the plsi draw as
        # written; an `else:` is presumably missing.
        if plsi:
            topic_dist = normalize([rand() for t in range(num_topics)])
            topic_dist = dirichlet(alpha)
        topic_cdf = get_cdf(topic_dist)
        for word in range(words_per_doc):
            # With probability `noise`, emit a random vocabulary entry.
            # NOTE(review): the non-noise path (an `else:` and the append of
            # the sampled word) is not visible; nor is any line that appends
            # to docs or topic_dists.
            if rand() < noise:
                doc.append(rsample(range(vocab_size), 1))
                topic = sample(topic_cdf)
        doc_index += 1
    return docs, word_dist, topic_dists
Exemplo n.º 42
 def dir_fun(x):
     """Return one Dirichlet draw parameterised by ``x + p_counts``.

     ``p_counts`` is taken from the enclosing scope.
     """
     return dirichlet(x + p_counts)
Exemplo n.º 43
def _sample_post(hp, ss):
    values = (hp.betas * hp.alpha).tolist()
    for i, count in ss.counts.iteritems():
        values[i] += count
    values.append(hp.beta0 * hp.alpha)
    return dirichlet(values)
Exemplo n.º 44
        write_proto("%s-%i.index" % (filename, div), c)

# Script entry point: draws per-topic/per-language word distributions and
# per-document topic proportions. NOTE(review): this excerpt is cut off
# after the theta loop; `flags`, `ml_vocab` and `Doc` are defined elsewhere.
if __name__ == "__main__":

    # beta maps (language, topic) -> Dirichlet draw over that vocabulary.
    beta = {}
    eta = zeros(flags.num_topics)
    vocab_total = defaultdict(int)
    for ii in xrange(flags.num_topics):
        # eta for topic ii is ii plus a random fraction of ii (0 for ii == 0).
        eta[ii] = ii + random() * float(ii)
        for ll in xrange(flags.num_langs):
            # Debug output of the vocabulary being used.
            print ml_vocab[ii]
            print ml_vocab[ii][ll]
            # Symmetric Dirichlet parameter, one entry per vocabulary item.
            gamma = [flags.gamma] * len(ml_vocab[ii][ll])
            beta[(ll, ii)] = dirichlet(gamma)
            print "BETA", (ll, ii), beta[(ll, ii)]

    # theta maps (language, document index) -> topic proportions.
    theta = {}
    # Symmetric prior whose entries sum to flags.alpha.
    alpha = [flags.alpha / float(flags.num_topics)] * flags.num_topics

    docs = defaultdict(Doc)

    print "Variance", flags.variance, flags.variance > 0

    for ll in xrange(flags.num_langs):
        # ii is rebound here to a (language, document index) key.
        for ii in [(ll, x) for x in xrange(flags.num_docs)]:
            # z_bar is allocated but not used in the visible part of the loop.
            z_bar = zeros(flags.num_topics)
            theta[ii] = DirichletDraw(alpha)
            docs[ii].lang = ll
            docs[ii].theta = theta[ii]
    i = 0
    while i < 20000:
        if i in mapping:
            count_lst[i] = mapping[i]
        i += 1
    return np.array(count_lst)

# Build one record per document by applying countWords to every input line.
result = lines.map(countWords)

# Symmetric Dirichlet parameters: 20 entries for the mixing weights and
# 20000 entries for each word distribution.
alpha = 20 * [0.1]
beta = np.array(20000 * [0.1])

# Random initialisation: mixing weights first, then 20 word distributions.
pi = dirichlet(alpha).tolist()
mu = np.array([dirichlet(beta) for j in range(20)])
log_mu = np.log(mu)

# Pull the individual record fields out of the RDD.
header = result.map(lambda rec: rec[0]).collect()
n = result.count()
l = result.map(lambda rec: rec[2]).collect()
x = (
    result
    .map(lambda rec: rec[1])
    .map(map_to_array)
    .cache()
)

# Derive a label from field 0 of each record: take the text that starts 18
# characters after 'id="' and stops at 'url', then keep what precedes the
# first "/". Field 1 is converted with map_to_array.
label = (
    result
    .map(lambda rec: (rec[0], rec[1]))
    .map(lambda rec: (rec[0][rec[0].index('id="') + 18:rec[0].index('url')],
                      rec[1]))
    .map(lambda rec: (rec[0][:rec[0].index("/")], map_to_array(rec[1])))
    .cache()
)

# Distinct label values as a NumPy array.
labels = np.array(label.map(lambda rec: rec[0]).distinct().collect())

def getProbs(checkParams, log_allMus, x, log_pi):
Exemplo n.º 46

        matrix = []

        for glosa in querys:
            tok = token(glosa)
            arrQuery = [0]*len(bag_of_word)
            for j  in tok:
                arrQuery[bag_of_word.index( j[0] )] = j[1]
        # random matrix
        Matrix_U = []
        for i in range(0,len(matrix)):
            Ui = dirichlet([1] * 3)
        c = FuzzyCMeans(matrix, Matrix_U)
        inf = 0
        nav = 0
        res = 0
        categoryArray = []
        for val in c.mu:
            cat = min_val(val[0], val[1], val[2])
            if cat == 'INF':
                inf += 1
            elif cat == 'NAV':
                nav +=1
Exemplo n.º 47
def DirichletDraw(alpha):
    """Return a Dirichlet(alpha) draw whose first entry is finite.

    Draws are repeated until the first component is neither NaN nor
    infinite; only that first component is inspected.
    """
    while True:
        draw = dirichlet(alpha)
        first = draw[0]
        if not (isnan(first) or isinf(first)):
            return draw