Example #1
def generate_corpus(alpha, m, beta, n, D, Nd):
    """
    Returns a grouped corpus drawn from a mixture of
    Dirichlet--multinomial unigram language models.

    Arguments:

    alpha -- concentration parameter for the Dirichlet prior over theta
    m -- T-dimensional mean of the Dirichlet prior over theta
    beta -- concentration parameter for the Dirichlet prior over phis
    n -- V-dimensional mean of the Dirichlet prior over phis
    D -- number of documents to generate
    Nd -- number of tokens to generate per document
    """

    corpus = GroupedCorpus()

    theta = dirichlet(alpha * array(m), 1)
    phis = dirichlet(beta * array(n), len(m))

    for d in xrange(D):
        [t] = sample(theta, 1)
        corpus.add(str(d), str(t), [str(v) for v in sample(phis[t, :], Nd)])

    return corpus
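A minimal self-contained sketch of the same generative process, assuming only numpy (GroupedCorpus and sample above come from the surrounding project and are not reproduced here):

import numpy as np

rng = np.random.default_rng(0)
alpha, m = 1.0, np.array([0.5, 0.5])         # prior over the T = 2 groups
beta, n = 1.0, np.full(10, 0.1)              # prior over the V = 10 word types
D, Nd = 3, 5

theta = rng.dirichlet(alpha * m)             # group proportions
phis = rng.dirichlet(beta * n, size=len(m))  # one word distribution per group

for d in range(D):
    t = rng.choice(len(m), p=theta)                  # group for document d
    tokens = rng.choice(len(n), size=Nd, p=phis[t])  # Nd word-type indices
    print(d, t, tokens)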
Example #2
File: misc.py Project: neerajg/sdap
def init_params(K, L, M, N, X1, X2, no_obs, train_I, train_J):
    # TODO: initialize sigma so that, early on, r is neither evened out nor neglected
    # (the update-r log-exp problem)
    alphas = [random(K,),random(L,)]
    alphas[0] = alphas[0]/np.sum(alphas[0])
    alphas[1] = alphas[1]/np.sum(alphas[1])
    gammas = [randint(low = 50, high = 500, size = (M,K)) + random((M,K)), randint(low = 1.46, high = 3, size = (N,L)) + random((N,L))]
    beta_shape = (K,L,1 + X1.shape[1] + X2.shape[1])
    sigmaY_shape = (K,L)
    #randint(low = -1, high = 1, size = beta_shape) + 
    betas = [random(beta_shape), randint(low = 10, high = 50, size = sigmaY_shape) + random(sigmaY_shape)]  
    
    r1 = dirichlet(alphas[0], no_obs)
    r1[r1<1e-4] = 1e-4
    #r1[r1>0.99] = 0.9
    r2 = dirichlet(alphas[1], no_obs)
    r2[r2<1e-6] = 1e-6
    #r2[r2>0.9] = 0.9    
    r = [r1,r2]
    ones = np.ones((len(train_I),))
    mu = sp.csr_matrix((ones, (train_I,train_J)), shape=(M,N)).sum(1)
    mv = sp.csr_matrix((ones, (train_I,train_J)), shape=(M,N)).sum(0)
    mu[mu<1] = 1
    mv[mv<1] = 1
    
    for k in range(K):
        gammas[0][:,k] = alphas[0][k] + np.array(np.divide(sp.csr_matrix((r1[:,k],(train_I,train_J)),shape=(M,N)).sum(1),mu).flatten())[0] # M x K
    for l in range(L):    
        gammas[1][:,l] = alphas[1][l] + np.array(np.divide(sp.csr_matrix((r2[:,l],(train_I,train_J)),shape=(M,N)).sum(0),mv).transpose().flatten())[0] # N x L
                  
    return alphas, gammas, betas, r
Example #3
def generate_corpus(alpha, m, beta, n, D, Nd):
    """
    Returns a grouped corpus drawn from a mixture of
    Dirichlet--multinomial unigram language models.

    Arguments:

    alpha -- concentration parameter for the Dirichlet prior over theta
    m -- T-dimensional mean of the Dirichlet prior over theta
    beta -- concentration parameter for the Dirichlet prior over phis
    n -- V-dimensional mean of the Dirichlet prior over phis
    D -- number of documents to generate
    Nd -- number of tokens to generate per document
    """
    
    T = len(m)
    V = len(n)
    corpus = GroupedCorpus()
    theta = dirichlet(alpha*m)
    zVector = sample(theta,D)   # generate group type Zd for each document d
    allPhis = [dirichlet(beta*n) for t in xrange(T)]
    allWords = zeros(Nd*D)

    for d in xrange(D):
        for i in xrange(Nd):  # use i to avoid shadowing the prior-mean argument n
            allWords[d*Nd+i] = sample(allPhis[zVector[d]])
    print allWords
    return allWords,zVector
Example #4
def iteration(V, D, N_DV, N_D, alpha, beta, M, phi_TV, z_D, inv_z_T, active_topics, inactive_topics, N_TV, N_T, D_T):
    """
    Performs a single iteration of Radford Neal's Algorithm 8.
    """

    for t in active_topics:
        phi_TV[t, :] = dirichlet(N_TV[t, :] + beta / V)

    for d in xrange(D):

        old_t = z_D[d]

        if inv_z_T is not None:
            inv_z_T[old_t].remove(d)

        N_TV[old_t, :] -= N_DV[d, :]
        N_T[old_t] -= N_D[d]
        D_T[old_t] -= 1

        seterr(divide='ignore')
        log_dist = log(D_T)
        seterr(divide='warn')

        idx = -1 * ones(M, dtype=int)
        idx[0] = old_t if D_T[old_t] == 0 else inactive_topics.pop()
        for m in xrange(1, M):
            idx[m] = inactive_topics.pop()
        active_topics |= set(idx)
        log_dist[idx] = log(alpha) - log(M)

        if idx[0] == old_t:
            phi_TV[idx[1:], :] = dirichlet(beta * ones(V) / V, M - 1)
        else:
            phi_TV[idx, :] = dirichlet(beta * ones(V) / V, M)

        for t in active_topics:
            log_dist[t] += (N_DV[d, :] * log(phi_TV[t, :])).sum()

        [t] = log_sample(log_dist)

        z_D[d] = t

        if inv_z_T is not None:
            inv_z_T[t].add(d)

        N_TV[t, :] += N_DV[d, :]
        N_T[t] += N_D[d]
        D_T[t] += 1

        idx = set(idx)
        idx.discard(t)
        active_topics -= idx
        inactive_topics |= idx
Example #5
def make_artificial_data(M,N,D1,D2,K,L,no_obs):
    X1 = random((M,D1))
    X2 = random((N,D2))
    Xs = [X1,X2]
    
    beta = randint(low = 0, high = 10, size = (K,L,1 + X1.shape[1] + X2.shape[1])) + random((K,L,1 + X1.shape[1] + X2.shape[1]))
    sigmaY = randint(low = 0, high = 10, size = (K,L)) + random((K,L))
    
    alphas = [randint(low = 0, high = 10000, size = (K,)) + random(K,), randint(low = 0, high = 1000, size = (L,)) + random(L,)]
    alpha1 = alphas[0]
    alpha2 = alphas[1]
    pi1 = dirichlet(alpha1,(M,1))
    pi2 = dirichlet(alpha2,(N,1))
    z1 = sample_discrete(pi1,(M,1))
    z2 = sample_discrete(pi2,(N,1))
   
    made_ij = False
    I = []
    J = []
    prev_len = 0
    while made_ij == False:  
        I.extend(randint(low = 0,high = M, size = (no_obs - prev_len,)))
        J.extend(randint(low = 0,high = N, size = (no_obs - prev_len,)))
        W = sp.csr_matrix((np.ones(no_obs),(I,J)), shape=(M,N))
        I,J = sp.find(W)[:2]
        I = list(I)
        J = list(J)
        if len(I) == no_obs:
            made_ij = True
        else:
            prev_len = len(I)
    
    Xbias = np.ones((no_obs,1)) # |Yobs| x 1
    Xusers = X1[I,:].reshape((no_obs,D1)) # |Yobs| x D1
    Xitems = X2[J,:].reshape((no_obs,D2)) # |Yobs| x D2
    X = np.hstack((Xbias, Xusers, Xitems)) # |Yobs| x (1 + D1 + D2)    
    Y = np.zeros((no_obs,)) # |Yobs| x 1
    for o in range(no_obs):
        Y[o] = sigmaY[int(z1[I[o]][0]),int(z2[J[o]][0])] * randn() + np.dot(beta[int(z1[I[o]][0]),int(z2[J[o]][0]),:],X[o,:])
    
    pis = [pi1,pi2]
    zs = [z1,z2]
    betas = [beta,sigmaY]
 
    params = {'alphas':alphas, 
              'pis':pis, 
              'zs':zs, 
              'betas':betas
              }
        
    return Xs, Y, I, J, params
Example #6
File: misc.py Project: neerajg/sdap
def init_params(K, L, M, N, X1, X2, no_obs, train_I, train_J):
    # TODO: initialize sigma so that, early on, r is neither evened out nor neglected
    # (the update-r log-exp problem)
    alphas = [random(K), random(L)]
    alphas[0] = alphas[0] / np.sum(alphas[0])
    alphas[1] = alphas[1] / np.sum(alphas[1])
    gammas = [
        randint(low=50, high=500, size=(M, K)) + random((M, K)),
        randint(low=1.46, high=3, size=(N, L)) + random((N, L)),
    ]
    beta_shape = (K, L, 1 + X1.shape[1] + X2.shape[1])
    sigmaY_shape = (K, L)
    # randint(low = -1, high = 1, size = beta_shape) +
    betas = [random(beta_shape), randint(low=10, high=50, size=sigmaY_shape) + random(sigmaY_shape)]

    m1 = np.zeros((K, X1.shape[1])) + random((K, X1.shape[1]))
    m2 = np.zeros((L, X2.shape[1])) + random((L, X2.shape[1]))
    sigma1 = np.zeros((K, X1.shape[1])) + random((K, X1.shape[1]))
    sigma2 = np.zeros((L, X2.shape[1])) + random((L, X2.shape[1]))
    theta1 = [m1, sigma1]
    theta2 = [m2, sigma2]
    thetas = [theta1, theta2]

    r1 = dirichlet(alphas[0], M)
    r1[r1 < 1e-4] = 1e-4
    # r1[r1>0.99] = 0.9
    r2 = dirichlet(alphas[1], N)
    r2[r2 < 1e-6] = 1e-6
    # r2[r2>0.9] = 0.9
    r = [r1, r2]
    ones = np.ones((len(train_I),))
    mu = sp.csr_matrix((ones, (train_I, train_J)), shape=(M, N)).sum(1)
    mv = sp.csr_matrix((ones, (train_I, train_J)), shape=(M, N)).sum(0).transpose()
    mu[mu < 1] = 1
    mv[mv < 1] = 1

    s1 = dirichlet(alphas[0], M)
    s1[s1 < 1e-4] = 1e-4
    # r1[r1>0.99] = 0.9
    s2 = dirichlet(alphas[1], N)
    s2[s2 < 1e-6] = 1e-6
    # r2[r2>0.9] = 0.9
    s = [s1, s2]

    gammas[0] = np.tile(alphas[0].reshape(1, K), (M, 1)) + s[0] + np.multiply(r[0], mu)  # M x K
    gammas[1] = np.tile(alphas[1].reshape(1, L), (N, 1)) + s[1] + np.multiply(r[1], mv)  # N x L

    return alphas, gammas, betas, thetas, r, s
Example #7
File: mmsbm.py Project: hannawallach/mmsbm
def generate_mmsbm_data(N, K, alpha, a, b, m=None):
    """
    N is the number of nodes
    K is the number of blocks
    alpha is the concentration parameter
    a and b are the shape parameters
    m is the base measure
    """

    if m == None:
        m = ones(K) / K # uniform base measure

    Y = zeros((N, N), dtype=int) # edges

    # sample node-specific distributions over blocks

    [theta] = dirichlet(alpha * m, (1, N))

    # sample between- and within-block edge probabilities

    phi = beta(a, b, (K, K))

    # sample block assignments and edges

    for i in range(1, N+1):
        for j in range(1, N+1):
            idx = (categorical(theta[i-1,:]), categorical(theta[j-1,:]))
            Y[i-1,j-1] = uniform() <= phi[idx]

    return theta, Y
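A standalone sketch of the same block-model sampling, assuming only numpy and using rng.choice in place of the project's categorical() helper:

import numpy as np

rng = np.random.default_rng(0)
N, K, alpha, a, b = 5, 3, 1.0, 1.0, 1.0
m = np.ones(K) / K                          # uniform base measure

theta = rng.dirichlet(alpha * m, size=N)    # per-node distributions over blocks
phi = rng.beta(a, b, size=(K, K))           # block-to-block edge probabilities

Y = np.zeros((N, N), dtype=int)
for i in range(N):
    for j in range(N):
        zi = rng.choice(K, p=theta[i])      # sender's block for this edge
        zj = rng.choice(K, p=theta[j])      # receiver's block
        Y[i, j] = rng.uniform() <= phi[zi, zj]
print(Y)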
Example #8
def sample_dirichlet_from_dict(dt):
    '''
    Sample one set or dirichlet distribution for given dictionary
    '''
    alphas = dt.values()
    raw_dist = dirichlet(alphas)
    return dict( zip((dt.keys()), (raw_dist)) )
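A minimal usage sketch that inlines the same call with plain numpy (wrapping the values in list() keeps it working on Python 3, where dict.values() is a view):

from numpy.random import dirichlet

dt = {'a': 2.0, 'b': 5.0, 'c': 1.0}   # categories and their pseudo-counts
raw = dirichlet(list(dt.values()))
print(dict(zip(dt.keys(), raw)))      # probabilities that sum to 1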
Example #9
File: mmsbm.py Project: hannawallach/mmsbm
def generate_sbm_data(N, K, alpha, a, b, m=None):
    """
    N is the number of nodes
    K is the number of blocks
    alpha is the concentration parameter
    a and b are the shape parameters
    m is the base measure
    """

    if m == None:
        m = ones(K) / K # uniform base measure

    Z = zeros((N, K)) # block assignments

    # sample (global) distribution over blocks

    [theta] = dirichlet(alpha * m, 1)

    # sample between- and within-block edge probabilities

    phi = beta(a, b, (K, K))

    # sample block assignments

    for n in range(1, N+1):
        Z[n-1,:] = multinomial(1, theta)

    # sample edges

    Y = (uniform(size=(N, N)) <= dot(dot(Z, phi), Z.T)).astype(int)

    return Z, Y
Example #10
File: tm.py Project: sbos/twtm
def generate_docs(phi, ndocs, nwords_per_doc, alpha=0.1, p0=0.8):
    K, V = phi.shape

    theta = np.zeros((ndocs, K), dtype=float)

    switch = np.append([0], binomial(1, p0, ndocs - 1))
    switch = switch == 0

    samples = dirichlet([alpha] * K, size=int(switch.sum()))
    theta[switch] = samples

    last_theta = None
    for t in xrange(0, ndocs):
        if switch[t] == True:
            last_theta = theta[t]
            continue

        theta[t] = last_theta

    def gen_z(theta):
        z = np.repeat(np.arange(K),
            multinomial(nwords_per_doc, theta, size=1)[0])
        np.random.shuffle(z)
        return z 

    z = np.apply_along_axis(gen_z, 1, theta)

    def gen_w(z):
        return np.random.multinomial(1, phi[z]).nonzero()[0][0]

    w = np.vectorize(gen_w)(z)

    return w, z, theta, switch
Example #11
File: tm.py Project: sbos/twtm
    def theta_t(th, n, p):
        pt = pt_t(th, n, p)
        if binomial(1, pt) == 1:
            return (th, pt, np.log(pt))

        tt = dirichlet(alpha + n, 1)[0]
        return (tt, pt, np.log(1-pt) + dir_logpdf(tt, alpha + n))
Example #12
def generate_ratings(num_types, num_users, ratings_per_user=20, num_items=100,
                     alpha=None, noise=-1, plsi=False):
    p = Poisson(ratings_per_user)
    ratings = [[rint(1,5) for i in range(num_items)] for i in range(num_types)]
    if alpha == None:
        alpha = [1]*num_types
    user_ratings = []
    user_indices = []
    type_dists = []
    for i in range(num_users):
        ratings_per_user = p.sample()
        if plsi:
            type_dist = normalize([rand() for t in range(num_types)])
        else:
            type_dist = dirichlet(alpha)
        type_dists.append(type_dist)
        rating = []
        indices = []
        for j in rsample(range(num_items), ratings_per_user):
            if rand() < noise:
                rating.append(rint(1,5))
            else:
                type = sample(type_dist)
                rating.append(ratings[type][j])
            indices.append(j)
        user_ratings.append(rating)
        user_indices.append(indices)
    user_ratings = user_indices, user_ratings
    
    return user_ratings, ratings, type_dists
Example #13
def inference(N_DV, alpha, beta, z_D, num_itns, true_z_D=None):
    """
    Nonconjugate split-merge.
    """

    M = 10  # number of auxiliary samples

    D, V = N_DV.shape

    T = D + M - 1  # maximum number of topics

    N_D = N_DV.sum(1)  # document lengths

    phi_TV = zeros((T, V))  # topic parameters

    inv_z_T = defaultdict(set)
    for d in xrange(D):
        inv_z_T[z_D[d]].add(d)  # inverse mapping from topics to documents

    active_topics = set(unique(z_D))
    inactive_topics = set(xrange(T)) - active_topics

    N_TV = zeros((T, V), dtype=int)
    N_T = zeros(T, dtype=int)

    for d in xrange(D):
        N_TV[z_D[d], :] += N_DV[d, :]
        N_T[z_D[d]] += N_D[d]

    D_T = bincount(z_D, minlength=T)

    # initialize topic parameters (necessary for Metropolis-Hastings only)

    for t in active_topics:
        phi_TV[t, :] = dirichlet(N_TV[t, :] + beta / V)

    for itn in xrange(num_itns):

        for _ in xrange(3):
            iteration(
                V, D, N_DV, N_D, alpha, beta, phi_TV, z_D, inv_z_T, active_topics, inactive_topics, N_TV, N_T, D_T, 6
            )

        algorithm_8_iteration(
            V, D, N_DV, N_D, alpha, beta, M, phi_TV, z_D, inv_z_T, active_topics, inactive_topics, N_TV, N_T, D_T
        )

        if true_z_D is not None:

            v = vi(true_z_D, z_D)

            print "Itn. %d" % (itn + 1)
            print "%d topics" % len(active_topics)
            print "VI: %f bits (%f bits max.)" % (v, log2(D))

            if v < 1e-6:
                break

    return phi_TV, z_D
Example #14
def get_fake_data(k, N):
    """
    k : number of categories and alpha parameters
    N : number of proportions in the training set
    """
    true_alphas = array([10, 5, 1, 1, 1, 1e-2, 1e-2, 1e-2, 1e-2, 1e-2])
    D = dirichlet(true_alphas, N)   # training set
    
    return D
Example #15
File: app.py Project: atran/parkov
def generateMarkovChain(sample_names):
  chain = {}
  for sample in sample_names:
    sample_probs = []
    chain[sample] = sample_probs
    random_distribution = dirichlet([1] * len(sample_names))
    for i, sample_child in enumerate(sample_names):
      song_prob = (sample_child, random_distribution[i])
      sample_probs.append(song_prob)
  return chain
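A quick sketch of what one row of the chain looks like, assuming only numpy; each sample gets a uniform-Dirichlet distribution over successor samples (the names here are hypothetical):

from numpy.random import dirichlet

sample_names = ['kick.wav', 'snare.wav', 'hat.wav']   # hypothetical sample names
row = dirichlet([1] * len(sample_names))
print(list(zip(sample_names, row)))                   # transition probabilities sum to 1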
Example #16
def get_dists(dim, num):
    """
    Returns an array of discrete distributions.

    Arguments:

    dim -- dimensionality of the distributions
    num -- number of distributions to return
    """

    return dirichlet(ones(dim), num)
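The same call, spelled out as a standalone sketch using only numpy:

from numpy import ones
from numpy.random import dirichlet

dists = dirichlet(ones(4), 3)   # equivalent to get_dists(4, 3)
print(dists.shape)              # (3, 4): one distribution per row
print(dists.sum(axis=1))        # each row sums to 1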
Example #17
def generate(num_seq, seq_length, alphabet, m_word_length, m_word_param, background_param):
	magic_thetas = [dirichlet(m_word_param) for j in range(m_word_length)]
	background_theta = dirichlet(background_param)
	sequences = []
	starts = []
	for k in range(num_seq):
		background_onehots = [multinomial(1, background_theta) for x in range(seq_length - m_word_length)]
		background = [alphabet[t] for t in [i.tolist().index(1) for i in background_onehots]]
		#background = [alphabet[t].lower() for t in [i.tolist().index(1) for i in background_onehots]]
		magic_onehots = [multinomial(1, theta) for theta in magic_thetas]
		magic_word = [alphabet[j] for j in [i.tolist().index(1) for i in magic_onehots]]
		start_pos = randint(seq_length - m_word_length)
		background[start_pos : start_pos] = magic_word
		sequences.append(background)
		starts.append(start_pos)
	#print starts
	ans = []
	ans.append(starts)
	ans.append(sequences)
	return ans
Example #18
 def randommodel(self, numstates, alphabet):
     I = array(dirichlet([1] * numstates))
     F = array([0.0] * numstates)
     S = []
     # F is treated as an end of string symbol
     for i in range(numstates):
         probs = dirichlet([1] * (alphabet + 1))
         newrow = array(probs[0:alphabet])
         self.normalize(newrow)
         S.append(newrow)
         F[i] = probs[alphabet]
 
     T = []
     for i in range(alphabet):
         T.append([])
         for j in range(numstates):
             newrow = array(dirichlet([1] * numstates))
             T[i].append(newrow)
 
     return (I, F, S, T)
Example #19
 def rand_init_param(self):
     logging.debug('Random param with seed: %s' % os.getpid())
     self.factors = [list() for _ in self.parts]
     # init cluster prior
     for p, prob in enumerate(dirichlet([HardEM.CLUSTER_PRIOR_ALPHA] * len(self.parts))):
         self.factors[p].append(ClusterPrior(prob))
     # init other singleton potential factors
     for p in self.parts:
         factors = self.factors[p]
         # factors.append(Binary_FC('isRealName', self.author_graph))
         # factors.append(Norm_FC('revLen', self.author_graph, (3, 7)))
         factors.append(ProdsFC('prProds', self.author_graph, self.author_product_map))
         factors.append(MembsFC('prMembs', self.author_graph))
Example #20
    def sample(self,m):
        """

        Samples m samples from the current Dirichlet distribution.

        :param m: Number of samples to draw.
        :type m: int.
        :returns:  A Data object containing the samples
        :rtype:    natter.DataModule.Data

        """

        return Data(dirichlet(tuple(self.param['alpha']),m).transpose(),str(m) + ' samples from ' + self.name)
Example #21
def generate_corpus(beta, mean, N):
    """
    Returns a corpus of tokens drawn from a Dirichlet--multinomial
    unigram language model. Each token is an instance of one of V
    unique word types, represented by indices 0, ..., V - 1.

    Arguments:

    beta -- concentration parameter for the Dirichlet prior
    mean -- V-dimensional mean of the Dirichlet prior
    N -- number of tokens to generate
    """

    return sample(dirichlet(beta * array(mean), 1), N)
Example #22
def comp_fractions(counts, method="dirichlet", **kwargs):
    """
    Convert counts to fractions using the given method.
    
    Parameters
    ----------
    method : string {dirichlet (default) | normalize | pseudo}
        dirichlet - randomly draw from the corresponding posterior 
                    Dirichlet distribution with a uniform prior.
                    That is, for a vector of counts C, 
                    draw the fractions from Dirichlet(C+1). 
        normalize - simply divide each row by its sum.
        pseudo    - add the given pseudo count (default 1) to each count and
                    do simple normalization.


    KW Arguments
    ------------
    p_counts : int/float (default 1)
        The value of the pseudo counts to add to all counts.
        Used only if method is dirichlet
    

    Returns
    -------
    fracs: CompData
        Component fractions as a compositional data object.
    """
    from Compositions import CompData

    n, m = np.shape(counts)
    if method == "dirichlet":
        from numpy.random.mtrand import dirichlet

        fracs = CompData(np.ones((n, m)))
        method = method.lower()
        for i in xrange(n):  # for each sample
            C = counts[i, :]  # counts of each otu in sample
            a = C + 1  # dirichlet parameters
            fracs[i, :] = dirichlet(a)
    elif method == "normalize":
        temp = counts.T
        fracs = CompData((temp / temp.sum()).T)
    elif method == "pseudo":
        p_counts = kwargs.pop("p_counts", 1)
        fracs = comp_fractions(counts + p_counts, method="normalize")
    else:
        raise ValueError, 'Unsupported method "%s"' % method
    return fracs
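A plain-array sketch of the dirichlet branch, assuming numpy only (no CompData): each row of counts C becomes one draw from Dirichlet(C + 1).

import numpy as np
from numpy.random.mtrand import dirichlet

counts = np.array([[10, 0, 5],
                   [ 3, 3, 3]])
fracs = np.vstack([dirichlet(row + 1) for row in counts])  # one draw per sample
print(fracs.sum(axis=1))                                   # each row of fractions sums to 1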
Example #23
File: misc.py Project: neerajg/sdap
def init_params(K, L, M, N, X1, X2, no_obs):
    # TODO: initialize sigma so that, early on, r is neither evened out nor neglected
    # (the update-r log-exp problem)
    alphas = [random(K,),random(L,)]
    alphas[0] = alphas[0]/np.sum(alphas[0])
    alphas[1] = alphas[1]/np.sum(alphas[1])
    gammas = [randint(low = 50, high = 500, size = (M,K)) + random((M,K)), randint(low = 1.46, high = 3, size = (N,L)) + random((N,L))]
    beta_shape = (K,L,1 + X1.shape[1] + X2.shape[1])
    sigmaY_shape = (K,L)
    #randint(low = -1, high = 1, size = beta_shape) + 
    betas = [random(beta_shape), randint(low = 10, high = 50, size = sigmaY_shape) + random(sigmaY_shape)]  
 
    r1 = dirichlet(alphas[0], M)
    r1[r1<1e-4] = 1e-4
    #r1[r1>0.99] = 0.9
    r2 = dirichlet(alphas[1], N)
    r2[r2<1e-6] = 1e-6
    #r2[r2>0.9] = 0.9    
    r = [r1,r2]   

    gammas[0] = np.tile(alphas[0].reshape(1,K), (M,1)) + r[0] # M x K
    gammas[1] = np.tile(alphas[1].reshape(1,L), (N,1)) + r[1] # N x L
                  
    return alphas, gammas, betas, r
Example #24
File: models.py Project: fbkarsdorp/pevo
 def copy(self, t, models, parents):
     """
     In the exemplar based model, individuals copy the trait of other
     individuals on the basis of the extrinsic properties of those individuals. 
     With a probability of C, individuals will be biased towards copying from 
     such individuals; the rest (1 - C) copies unbiased.
     """        
     if t > 0:
         biased_parents = self.rnd.choice(self.N, size=self.N, p=dirichlet([self.alpha] * self.N))
         biased = self.rnd.rand(self.N) < self.C
         biased_parents[~biased] = parents[~biased]
         self.population = self.population[biased_parents]
         self.parents[t] = biased_parents
     else:
         self.population = models
         self.parents[t] = parents
Example #25
    def __init__(self, params):
        # The word distribution of this node's topic.
        self.word_dist = dirichlet(params["topic_to_word_param"])
        self.word_cdf = util.get_cdf(self.word_dist)

        # The number of documents that pass through this node.
        self.num_documents = 0

        # Those children of this node which have looked below this level.
        # Documents that reached this node but never looked below aren't
        # represented here; this is okay because the Chinese Restaurant
        # Process is exchangeable (doesn't depend on order).
        self.children = []

        # The number of documents which looked below this level.  This
        # should always be equal to sum(c.num_documents for c in
        # self.children).
        self.num_documents_in_children = 0
Example #26
def generate_data(V, D, l, alpha, beta):
    """
    Generates a synthetic corpus of documents from a Dirichlet process
    mixture model with multinomial mixture components (topics). The
    mixture components are drawn from a symmetric Dirichlet prior.

    Arguments:

    V -- vocabulary size
    D -- number of documents
    l -- average document length
    alpha -- concentration parameter for the Dirichlet process
    beta -- concentration parameter for the symmetric Dirichlet prior
    """

    T = D # maximum number of topics

    phi_TV = zeros((T, V))
    z_D = zeros(D, dtype=int)
    N_DV = zeros((D, V), dtype=int)

    for d in xrange(D):

        # draw a topic assignment for this document

        dist = bincount(z_D).astype(float)
        dist[0] = alpha
        [t] = sample(dist)
        t = len(dist) if t == 0 else t
        z_D[d] = t

        # if it's a new topic, draw the parameters for that topic

        if t == len(dist):
            phi_TV[t - 1, :] = dirichlet(beta * ones(V) / V)

        # draw the tokens from the topic

        for v in sample(phi_TV[t - 1, :], num_samples=poisson(l)):
            N_DV[d, v] += 1

    z_D = z_D - 1

    return phi_TV, z_D, N_DV
Example #27
File: tm.py Project: sbos/twtm
def E_step(w, phi, alpha, beta, p0, L, n=None, theta=None, maxiter=100, resampling=True, smoothing=False):
    T, N = w.shape
    K, V = phi.shape

    log_theta = None

    if n == None:
        n = dirichlet(alpha, size=T) * N
    if theta == None:
        theta, pt, log_theta = q_theta(n, alpha, p0, L)
    # if theta == None:
    #     theta = dirichlet(alpha, size=T)
    #     log_theta = np.log(theta)
    # if n == None:
    #     n = theta * N

    log_phi = np.log(phi)

    pt = None
    likelihood_log = np.zeros(maxiter, dtype=float)
    theta_log = np.zeros((maxiter, T, K), dtype=float)

    for iteration in xrange(maxiter):
        z = q_z(w, log_theta, phi)
        #z = q_z_alt(w, theta, phi)
        n = z.sum(axis=1)
        new_theta, pt, new_log_theta = q_theta(n, alpha, p0, L, resampling=resampling,
            smoothing=smoothing) 

        #set_trace()
        diff = np.abs(theta - new_theta)
        avg_diff = diff.mean()
        max_diff = diff.max()

        likelihood_log[iteration] = likelihood(w, z, np.log(new_theta), log_phi)
        print 'iteration %d. avg diff: %f. max diff: %f. likelihood: %f' %\
         (iteration, avg_diff, max_diff, likelihood_log[iteration])

        theta_log[iteration] = theta
        log_theta = new_log_theta
        theta = new_theta

    return z, theta, pt, likelihood_log, theta_log
Example #28
def computePrior(options):
    num = options.num
    clusterAlpha = options.clusterAlpha
    balanced = options.balanced
    numClusters = options.numClusters

    assert (num > 0)

    if numClusters is None:
        numClusters = max(int(log(num, 1.5)), 2)

    print(numClusters, "clusters", file=sys.stderr)

    clusterPrior = [clusterAlpha] * numClusters
    if not balanced:
        clusterPrior = dirichlet(clusterPrior)
    else:
        norm = sum(clusterPrior)
        clusterPrior = [xx / norm for xx in clusterPrior]

    return clusterPrior
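The core of the unbalanced branch as a standalone sketch, assuming only numpy and the standard library; the number of clusters grows logarithmically with num:

from math import log
from numpy.random import dirichlet

num, clusterAlpha = 100, 1.0
numClusters = max(int(log(num, 1.5)), 2)        # about 11 clusters for 100 items
clusterPrior = dirichlet([clusterAlpha] * numClusters)
print(numClusters, clusterPrior.sum())          # the prior sums to 1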
Example #29
def computePrior(options):
    num = options.num
    clusterAlpha = options.clusterAlpha
    balanced = options.balanced
    numClusters = options.numClusters

    assert(num > 0)

    if numClusters is None:
        numClusters = max(int(log(num, 1.5)), 2)

    print(numClusters, "clusters", file=sys.stderr)

    clusterPrior = [clusterAlpha] * numClusters
    if not balanced:
        clusterPrior = dirichlet(clusterPrior)
    else:
        norm = sum(clusterPrior)
        clusterPrior = [xx/norm for xx in clusterPrior]

    return clusterPrior
Example #30
def prob5(data, iters=5000):
    post = []
    post_temp = namedtuple('post', 'style, abv, post_prob')
    for label, vals in data.items():
        counts = Counter(vals)
        freq_data = [(i, j) for i, j in counts.items()]
        keys = [i[0] for i in freq_data]
        obs = [i[1] for i in freq_data]
        results = []
        for n in xrange(iters):
            samp = dirichlet([x for x in obs])
            results.append(samp)
        results = np.array(results)
        probs = np.mean(results, axis=0)
        n_data = zip(keys, probs)
        for key, prob in n_data:
            if prob is not None and key == 5:
                datum = post_temp(style=label[1], abv=label[0], post_prob=prob)
                post.append(datum)

    return post
Example #31
 def to_fractions(self, method='dirichlet', **kwargs):
     '''
     Convert counts to fraction, either by simple normalization or adding pseudo counts, or dirichlet sampling.
     If dirichlet sampling is used, for each sample (col) fit a dirichlet distribution and sample the fraction from it.
     The prior is a uniform dirichlet (a = ones(len(otus)) )
     Return a new instance.
     '''
     if method is 'normalize': fracs = self.normalize()
     elif method is 'pseudo':
         p_counts = kwargs.get('p_counts', 1)
         fracs = (self + p_counts).normalize()
     else:
         from numpy.random.mtrand import dirichlet
         mat, row_labels, col_labels = self.to_matrix()
         for i in range(len(col_labels)):  # for each sample
             N = mat[:, i]  # counts of each otu in sample
             a = N + 1  # dirichlet parameters
             f = dirichlet(a)  # fractions are random sample from dirichlet
             mat[:, i] = f
         fracs = self.remove_rows(self.row_labels())
         fracs.from_matrix(mat, row_labels, col_labels)
     return fracs
Example #32
 def to_fractions(self, method = 'dirichlet', **kwargs):
     '''
     Convert counts to fraction, either by simple normalization or adding pseudo counts, or dirichlet sampling.
     If dirichlet sampling is used, for each sample (col) fit a dirichlet distribution and sample the fraction from it.
     The prior is a uniform dirichlet (a = ones(len(otus)) )
     Return a new instance.
     '''
     if method is 'normalize': fracs = self.normalize()
     elif method is 'pseudo': 
         p_counts = kwargs.get('p_counts',1) 
         fracs = (self+p_counts).normalize()
     else:
         from numpy.random.mtrand import dirichlet
         mat, row_labels, col_labels = self.to_matrix()
         for i in range(len(col_labels)): # for each sample
             N        = mat[:,i]     # counts of each otu in sample
             a        = N+1          # dirichlet parameters
             f        = dirichlet(a) # fractions are random sample from dirichlet
             mat[:,i] = f
         fracs = self.remove_rows(self.row_labels())
         fracs.from_matrix(mat, row_labels, col_labels)
     return fracs
Example #33
def generate_data(T, K, beta, n=None):
    """
    T is the number of timesteps
    K is the dimensionality
    beta is the concentration parameter
    n is the base measure
    """

    if n == None:
        n = ones(K) / K  # uniform base measure

    x = zeros(T)  # observations
    r = zeros(T)  # run lengths

    C = []  # changepoints

    for t in range(1, T + 1):

        # sample run length

        if t == 1:
            r[t - 1] = 0
        else:
            if uniform() < hazard([r[t - 2] + 1]):
                r[t - 1] = 0
            else:
                r[t - 1] = r[t - 2] + 1

        # sample new parameters if run length is zero

        if r[t - 1] == 0:
            C.append(t)
            [phi] = dirichlet(beta * n, 1)

        x[t - 1] = categorical(phi)  # sample data

    return x, r, C
Example #34
def generate_ratings(num_types,
                     num_users,
                     ratings_per_user=20,
                     num_items=100,
                     alpha=None,
                     noise=-1,
                     plsi=False):
    p = Poisson(ratings_per_user)
    ratings = [[rint(1, 5) for i in range(num_items)]
               for i in range(num_types)]
    if alpha == None:
        alpha = [1] * num_types
    user_ratings = []
    user_indices = []
    type_dists = []
    for i in range(num_users):
        ratings_per_user = p.sample()
        if plsi:
            type_dist = normalize([rand() for t in range(num_types)])
        else:
            type_dist = dirichlet(alpha)
        type_dists.append(type_dist)
        rating = []
        indices = []
        for j in rsample(range(num_items), ratings_per_user):
            if rand() < noise:
                rating.append(rint(1, 5))
            else:
                type = sample(type_dist)
                rating.append(ratings[type][j])
            indices.append(j)
        user_ratings.append(rating)
        user_indices.append(indices)
    user_ratings = user_indices, user_ratings

    return user_ratings, ratings, type_dists
Example #35
 def rand_init(self):
     p = np.clip(dirichlet([1] * 2)[0], EPS, 1 - EPS)
     self.log_val = np.log(p)
     self.log_1_val = np.log(1 - p)
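A standalone sketch of the same initialization, assuming numpy and a small EPS constant (hypothetical here); clipping keeps both log(p) and log(1 - p) finite:

import numpy as np
from numpy.random import dirichlet

EPS = 1e-10                                       # hypothetical clipping constant
p = np.clip(dirichlet([1] * 2)[0], EPS, 1 - EPS)  # first component of a 2-dim draw
print(p, np.log(p), np.log(1 - p))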
Example #36
 def rand_init(self):
     self.log_pr_prod = np.log(
         dirichlet([HardEM.PROD_PRIOR_ALPHA] *
                   self.n_all_membs))  # near uniform initialization
Example #37
                count += 1
                idx = top20k.index(w)
                if idx in numwords:
                    numwords[idx] += 1
                else:
                    numwords[idx] = 1
    return (header, numwords, count)


result = lines.map(countWords)
result.cache()

alpha = [0.1] * 20
beta = np.array([0.1] * 20000)

pi = dirichlet(alpha).tolist()  # *** vector gives prevalence of each category
mu = np.array([
    dirichlet(beta) for j in range(20)
])  # *** prob vector: prevalence of each word of each category in each doc
log_mu = np.log(mu)
header = result.map(lambda x: x[0]).collect()
x = result.map(lambda x: x[1]).map(
    map_to_array).cache()  # *** Number of occurrences of each word in each doc.


# getProbs accepts four parameters:
#
# checkParams: set to true if you want a check on all of the params
#   that makes sure that everything looks OK. This will make the
#   function run slower; use only for debugging
#
Example #38
def sample_post(hp, ss):
    return dirichlet(ss.counts + hp.alphas)
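A minimal sketch of calling this posterior draw, with hypothetical stand-ins for the hp and ss objects (the real ones come from the surrounding project):

from collections import namedtuple
import numpy as np
from numpy.random import dirichlet

HP = namedtuple('HP', 'alphas')          # hypothetical hyperparameter holder
SS = namedtuple('SS', 'counts')          # hypothetical sufficient statistics

hp = HP(alphas=np.ones(3))
ss = SS(counts=np.array([4, 0, 1]))
print(dirichlet(ss.counts + hp.alphas))  # one posterior draw over 3 categories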
Example #39
def gibbs_sep_doc(alpha,Pi,A,word_doc,z_doc,doc_list,\
                  z_count_all,p_md_all,E_theta_all,E_m_d_theta_all,\
                  X,no_of_itr,vocabSize,proc_id):

    no_of_topics = A.shape[1] 
    K = alpha.shape[0] # No of mixture components
    multi_rand = np.random.multinomial(1,[1/no_of_topics]*no_of_topics,\
                                        size=(vocabSize))
    z_init = np.array([mat.argmax() for mat in multi_rand])
    theta = np.ones([no_of_topics])*(1/no_of_topics)

    numdocs = doc_list.shape[0]
    p_M = np.zeros([K])
    E_theta = np.zeros([no_of_topics])
    E_m_d_theta = np.zeros([K,no_of_topics])
    z_count = np.zeros([vocabSize,no_of_topics])
               
    # iterating through every document  
    idx = 0
    for doc_index in doc_list:
        word_indices = word_doc[idx]
        z_d = z_doc[idx]
        E_theta.fill(0)
        E_m_d_theta.fill(0)
        p_M.fill(0)
        z_count.fill(0)
        for i in range(no_of_itr + X):    

            # Sampling the mixture component
            p_theta_gm = np.empty(K)
            for k in range(K):
                p_theta_gm[k] = dirichlet_log_prob(theta,alpha[k])
            log_p_md = log_np_array(Pi) + p_theta_gm
            norm_log_pmd = (log_p_md - compute_log_sum(log_p_md))
            p_md = np.array([math.exp(m) for m in norm_log_pmd])
            p_md = p_md/p_md.sum()
            M = np.random.multinomial(1,p_md,size=1).argmax()

            # Sampling theta
            alpha_d = alpha[M]
            topics_count = count_topics(z_d, word_indices, no_of_topics)
            alpha_p = alpha_d + topics_count
            local_theta = dirichlet(alpha_p, size=1)
            local_theta = np.array([remove_zero(m) for m in local_theta[0]])
            theta = local_theta/local_theta.sum()
            
            w_count = 0
            for w_index in word_indices:
                # iterating through every word in document to sample z
                p_zd = A[w_index] * theta
                p_zd = p_zd/p_zd.sum()
                word_topic = np.random.multinomial(1,p_zd,size=1).argmax()
                z_d[w_count] = word_topic
                w_count = w_count + 1 

                if (i >= X):                  
                    z_count[w_index, word_topic] += 1 
                
            # Save the theta and M samples only after the burn-in period
            if (i >= X):
                p_M[M] += 1 
                E_theta += theta 
                E_m_d_theta[M] += log(theta)

        idx += 1

        # Communicate the arrays back to the parent process
        z_count_all[doc_index] = z_count
        p_md_all[doc_index] = p_M/no_of_itr
        E_theta_all[doc_index] = E_theta/no_of_itr
        E_m_d_theta_all[doc_index] = E_m_d_theta/no_of_itr
Example #40
def iteration(V, D, N_DV, N_D, alpha, beta, phi_TV, z_D, inv_z_T, active_topics, inactive_topics, N_TV, N_T, D_T, num_inner_itns):
    """
    Performs a single iteration of Metropolis-Hastings (split-merge).
    """

    phi_s_V = empty(V)
    phi_t_V = empty(V)
    phi_merge_t_V = empty(V)

    N_s_V = empty(V, dtype=int)
    N_t_V = empty(V, dtype=int)
    N_merge_t_V = empty(V, dtype=int)

    log_dist = empty(2)

    d, e = choice(D, 2, replace=False) # choose 2 documents

    if z_D[d] == z_D[e]:
        s = inactive_topics.pop()
        active_topics.add(s)
    else:
        s = z_D[d]

    inv_z_s = set([d])
    N_s_V[:] = N_DV[d, :]
    N_s = N_D[d]
    D_s = 1

    t = z_D[e]
    inv_z_t = set([e])
    N_t_V[:] = N_DV[e, :]
    N_t = N_D[e]
    D_t = 1

    inv_z_merge_t = set([d, e])
    N_merge_t_V[:] = N_DV[d, :] + N_DV[e, :]
    N_merge_t = N_D[d] + N_D[e]
    D_merge_t = 2

    if z_D[d] == z_D[e]:
        idx = inv_z_T[t] - set([d, e])
    else:
        idx = (inv_z_T[s] | inv_z_T[t]) - set([d, e])

    for f in idx:
        if uniform() < 0.5:
            inv_z_s.add(f)
            N_s_V += N_DV[f, :]
            N_s += N_D[f]
            D_s += 1
        else:
            inv_z_t.add(f)
            N_t_V += N_DV[f, :]
            N_t += N_D[f]
            D_t += 1

        inv_z_merge_t.add(f)
        N_merge_t_V += N_DV[f, :]
        N_merge_t += N_D[f]
        D_merge_t += 1

    if z_D[d] == z_D[e]:
        phi_merge_t_V[:] = phi_TV[t, :]
    else:
        phi_merge_t_V = dirichlet(N_merge_t_V + beta / V)

    acc = 0.0

    for inner_itn in xrange(num_inner_itns):

        # sample new parameters for topics s and t ... but if it's the
        # last iteration and we're doing a merge, then just set the
        # parameters back to phi_TV[s, :] and phi_TV[t, :]

        if inner_itn == num_inner_itns - 1 and z_D[d] != z_D[e]:
            phi_s_V[:] = phi_TV[s, :]
            phi_t_V[:] = phi_TV[t, :]
        else:
            phi_s_V = dirichlet(N_s_V + beta / V)
            phi_t_V = dirichlet(N_t_V + beta / V)

        if inner_itn == num_inner_itns - 1:

            acc += gammaln(N_s + beta)
            acc -= gammaln(N_s_V + beta / V).sum()
            acc += ((N_s_V + beta / V - 1) * log(phi_s_V)).sum()

            acc += gammaln(N_t + beta)
            acc -= gammaln(N_t_V + beta / V).sum()
            acc += ((N_t_V + beta / V - 1) * log(phi_t_V)).sum()

            acc -= gammaln(N_merge_t + beta)
            acc += gammaln(N_merge_t_V + beta / V).sum()
            acc -= ((N_merge_t_V + beta / V - 1) *
                    log(phi_merge_t_V)).sum()

        for f in idx:

            # (fake) restricted Gibbs sampling scan

            if f in inv_z_s:
                inv_z_s.remove(f)
                N_s_V -= N_DV[f, :]
                N_s -= N_D[f]
                D_s -= 1
            else:
                inv_z_t.remove(f)
                N_t_V -= N_DV[f, :]
                N_t -= N_D[f]
                D_t -= 1

            log_dist[0] = log(D_s)
            log_dist[0] += (N_DV[f, :] * log(phi_s_V)).sum()

            log_dist[1] = log(D_t)
            log_dist[1] += (N_DV[f, :] * log(phi_t_V)).sum()

            log_dist -= log_sum_exp(log_dist)

            if inner_itn == num_inner_itns - 1 and z_D[d] != z_D[e]:
                u = 0 if z_D[f] == s else 1
            else:
                [u] = log_sample(log_dist)

            if u == 0:
                inv_z_s.add(f)
                N_s_V += N_DV[f, :]
                N_s += N_D[f]
                D_s += 1
            else:
                inv_z_t.add(f)
                N_t_V += N_DV[f, :]
                N_t += N_D[f]
                D_t += 1

            if inner_itn == num_inner_itns - 1:
                acc += log_dist[u]

    if z_D[d] == z_D[e]:

        acc *= -1.0

        acc += log(alpha)
        acc += gammaln(D_s) + gammaln(D_t) - gammaln(D_T[t])
        tmp = beta / V
        acc += gammaln(beta) - V * gammaln(tmp)
        acc += (tmp - 1) * (log(phi_s_V).sum() + log(phi_t_V).sum())
        acc -= (tmp - 1) * log(phi_TV[t, :]).sum()

        acc += (N_s_V * log(phi_s_V)).sum() + (N_t_V * log(phi_t_V)).sum()
        acc -= (N_TV[t, :] * log(phi_TV[t, :])).sum()

        if log(uniform()) < min(0.0, acc):
            phi_TV[s, :] = phi_s_V
            phi_TV[t, :] = phi_t_V
            z_D[list(inv_z_s)] = s
            z_D[list(inv_z_t)] = t
            inv_z_T[s] = inv_z_s
            inv_z_T[t] = inv_z_t
            N_TV[s, :] = N_s_V
            N_TV[t, :] = N_t_V
            N_T[s] = N_s
            N_T[t] = N_t
            D_T[s] = D_s
            D_T[t] = D_t
        else:
            active_topics.remove(s)
            inactive_topics.add(s)

    else:

        acc -= log(alpha)
        acc += gammaln(D_merge_t) - gammaln(D_T[s]) - gammaln(D_T[t])
        tmp = beta / V
        acc += V * gammaln(tmp) - gammaln(beta)
        acc += (tmp - 1) * log(phi_merge_t_V).sum()
        acc -= (tmp - 1) * (log(phi_TV[s, :]).sum() + log(phi_TV[t, :]).sum())

        acc += (N_merge_t_V * log(phi_merge_t_V)).sum()
        acc -= ((N_TV[s, :] * log(phi_TV[s, :])).sum() +
                (N_TV[t, :] * log(phi_TV[t, :])).sum())

        if log(uniform()) < min(0.0, acc):
            phi_TV[s, :] = zeros(V)
            phi_TV[t, :] = phi_merge_t_V
            active_topics.remove(s)
            inactive_topics.add(s)
            z_D[list(inv_z_merge_t)] = t
            inv_z_T[s].clear()
            inv_z_T[t] = inv_z_merge_t
            N_TV[s, :] = zeros(V, dtype=int)
            N_TV[t, :] = N_merge_t_V
            N_T[s] = 0
            N_T[t] = N_merge_t
            D_T[s] = 0
            D_T[t] = D_merge_t
Example #41
def generate_docs(num_topics,
                  num_docs,
                  words_per_doc=50,
                  vocab_size=30,
                  alpha=0.001,
                  beta=0.01,
                  noise=-1,
                  plsi=False):
    """Generates documents according to plsi or lda
    
    Args:
        num_topics: 
            the number of underlying latent topics
        num_docs: 
            the number of documents to generate
        words_per_doc: 
            parameter to a Poisson distribution;
            determines the average number of words in a document
        vocab_size: 
            the number of words in the vocabulary
        DIRICHLET PARAMETERS
        ---------------------
        Assumes symmetric dirichlet distributions (ie all elements in the
        parameter vector have the same value)
        ---------------------
        alpha: 
            parameter to dirichlet distribution for topics
        beta: 
            parameter to dirichlet distribution for words
        noise: 
            given as a probability; each word will be replaced with a random
            word with noise probability
        plsi:
            flag to determine which distribution to draw from,
            a random distribution or a sample from a dirichlet distribution
            
    Returns:
        docs:
            the list of documents, each a list of words (represented by their
            indices in range(vocab_size))
        word_dist:
            the distribution over words for each topic; 
            each row is the distribution for a different topic 
        topics_dist:
            the distribution over topics for each document;
            each row is the distribution for a different document
    """
    p = Poisson(words_per_doc)

    alpha = [alpha] * num_topics
    beta = [beta] * num_topics

    if plsi:
        word_dist = [
            normalize([rand() for w in range(vocab_size)])
            for t in range(num_topics)
        ]
    else:
        word_dist = [dirichlet(beta) for i in range(num_topics)]
    word_cdfs = []
    for topic in word_dist:
        word_cdfs.append(get_cdf(topic))

    topic_cdfs = []
    docs = []
    topic_dists = []
    doc_index = 0
    for i in range(num_docs):
        if doc_index % 100 == 0:
            print "reached document", doc_index
        words_per_doc = p.sample()
        doc = []
        if plsi:
            topic_dist = normalize([rand() for t in range(num_topics)])
        else:
            topic_dist = dirichlet(alpha)
        topic_dists.append(topic_dist)
        topic_cdf = get_cdf(topic_dist)
        topic_cdfs.append(topic_cdf)
        for word in range(words_per_doc):
            if rand() < noise:
                doc.append(rsample(range(vocab_size), 1))
            else:
                topic = sample(topic_cdf)
                doc.append(sample(word_cdfs[topic]))
        docs.append(doc)
        doc_index += 1
    return docs, word_dist, topic_dists
Example #42
 def dir_fun(x):
     a = x+p_counts
     f = dirichlet(a)
     return f
Example #43
File: dpm.py Project: yarden/distributions
def _sample_post(hp, ss):
    values = (hp.betas * hp.alpha).tolist()
    for i, count in ss.counts.iteritems():
        values[i] += count
    values.append(hp.beta0 * hp.alpha)
    return dirichlet(values)
Example #44
        write_proto("%s-%i.index" % (filename, div), c)


if __name__ == "__main__":
    flags.InitFlags()

    beta = {}
    eta = zeros(flags.num_topics)
    vocab_total = defaultdict(int)
    for ii in xrange(flags.num_topics):
        eta[ii] = ii + random() * float(ii)
        for ll in xrange(flags.num_langs):
            print ml_vocab[ii]
            print ml_vocab[ii][ll]
            gamma = [flags.gamma] * len(ml_vocab[ii][ll])
            beta[(ll, ii)] = dirichlet(gamma)
            print "BETA", (ll, ii), beta[(ll, ii)]

    theta = {}
    alpha = [flags.alpha / float(flags.num_topics)] * flags.num_topics

    docs = defaultdict(Doc)

    print "Variance", flags.variance, flags.variance > 0

    for ll in xrange(flags.num_langs):
        for ii in [(ll, x) for x in xrange(flags.num_docs)]:
            z_bar = zeros(flags.num_topics)
            theta[ii] = DirichletDraw(alpha)
            docs[ii].lang = ll
            docs[ii].theta = theta[ii]
Example #45
    i = 0
    while i < 20000:
        if i in mapping:
            count_lst[i] = mapping[i]
        i += 1
    return np.array(count_lst)


# calculate term frequency vector for each document
result = lines.map(countWords)
result.cache()

alpha = [0.1] * 20
beta = np.array([0.1] * 20000)

pi = dirichlet(alpha).tolist()
mu = np.array([dirichlet(beta) for j in range(20)])
log_mu = np.log(mu)
header = result.map(lambda x: x[0]).collect()
n = result.count()
l = result.map(lambda x: x[2]).collect()
x = result.map(lambda x: x[1]).map(map_to_array).cache()

label = result.map(lambda x: (x[0], x[1])).map(
    lambda x: ((x[0][x[0].index('id="') + 18:x[0].index('url')]), x[1])).map(
        lambda x: (x[0][:x[0].index("/")], map_to_array(x[1]))).cache()

labels = np.array(label.map(lambda x: x[0]).distinct().collect())


def getProbs(checkParams, log_allMus, x, log_pi):
Example #46
            bag_of_word_num.append(globalWords[word])

        matrix = []

        for glosa in querys:
            tok = token(glosa)
            arrQuery = [0]*len(bag_of_word)
            for j  in tok:
                arrQuery[bag_of_word.index( j[0] )] = j[1]
            matrix.append(arrQuery)
        
        # random matrix
        Matrix_U = []
        
        for i in range(0,len(matrix)):
            Ui = dirichlet([1] * 3)
            Matrix_U.append(Ui)
        
        c = FuzzyCMeans(matrix, Matrix_U)
        c(kmax)
        inf = 0
        nav = 0
        res = 0
        categoryArray = []
        for val in c.mu:
            cat = min_val(val[0], val[1], val[2])
            categoryArray.append(cat)
            if cat == 'INF':
                inf += 1
            elif cat == 'NAV':
                nav +=1
Example #47
def DirichletDraw(alpha):
    t = dirichlet(alpha)
    while isnan(t[0]) or isinf(t[0]):
        t = dirichlet(alpha)
    return t
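A small usage sketch; the retry loop presumably guards against the NaN components that numpy's dirichlet can produce when all concentration parameters are very small.

from numpy import isinf, isnan
from numpy.random import dirichlet

alpha = [1e-3] * 5          # tiny concentrations are the problematic case
t = dirichlet(alpha)
while isnan(t[0]) or isinf(t[0]):
    t = dirichlet(alpha)    # resample until the draw is finite
print(t, t.sum())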