Example #1
0
 def MakeData(self, N=10000):
     ''' Build a toy 2D dataset: N points from each of 4 rotated Gaussians.

     The base covariance is highly anisotropic; copies rotated by
     multiples of pi/4 give four elongated clusters crossing the origin.
     Sets self.Sigma, self.Data, and self.trueresp (one-hot labels).
     '''
     baseCov = np.asarray([[100, 0], [0, 0.01]])
     covStack = np.zeros((2, 2, 4))
     covStack[:, :, 0] = baseCov
     for j in range(1, 4):
         covStack[:, :, j] = RandUtil.rotateCovMat(baseCov, theta=j * np.pi / 4)
     self.Sigma = covStack
     sampleChunks = list()
     respChunks = list()
     for k in range(covStack.shape[2]):
         sampleChunks.append(RandUtil.mvnrand([0, 0], covStack[:, :, k], N))
         oneHot = np.zeros((N, 4))
         oneHot[:, k] = 1.0
         respChunks.append(oneHot)
     self.Data = XData(X=np.vstack(sampleChunks))
     self.trueresp = np.vstack(respChunks)
 def MakeData(self, N=10000):
   ''' Create toy data: 4 anisotropic Gaussian clusters, N samples each.

   Stores the stacked covariances in self.Sigma, the pooled samples in
   self.Data (an XData object), and one-hot labels in self.trueresp.
   '''
   S1 = np.asarray([[100, 0], [0, 0.01]])
   thetas = [np.pi / 4, 2 * np.pi / 4, 3 * np.pi / 4]
   Sigma = np.zeros((2, 2, 4))
   Sigma[:, :, 0] = S1
   for pos, theta in enumerate(thetas):
     Sigma[:, :, pos + 1] = RandUtil.rotateCovMat(S1, theta=theta)
   self.Sigma = Sigma
   allX = list()
   allResp = list()
   nClusters = Sigma.shape[2]
   for k in range(nClusters):
     allX.append(RandUtil.mvnrand([0, 0], Sigma[:, :, k], N))
     labelBlock = np.zeros((N, nClusters))
     labelBlock[:, k] = 1.0
     allResp.append(labelBlock)
   self.Data = XData(X=np.vstack(allX))
   self.trueresp = np.vstack(allResp)
Example #3
0
    def CreateToyDataFromMixModel(cls, seed=101,
                                  nDocTotal=None,
                                  nWordsPerDoc=None,
                                  nWordsPerDocFunc=None,
                                  beta=None,
                                  topics=None,
                                  **kwargs):
        ''' Generates BagOfWordsData dataset via mixture generative model.

        Each document draws exactly one topic from beta, then draws all
        nWordsPerDoc word tokens i.i.d. from that topic's distribution.

        Returns
        ------
        Data : BagOfWordsData object
        '''
        from bnpy.util import RandUtil
        PRNG = np.random.RandomState(seed)

        nTopics, vocabSize = topics.shape
        # Renormalize so each topic row is a proper word distribution
        topics = topics / topics.sum(axis=1)[:, np.newaxis]
        assert nTopics == beta.size

        doc_range = np.zeros(nDocTotal + 1)
        idsByDoc = list()
        countsByDoc = list()

        resp = np.zeros((nDocTotal, nTopics))
        topicChoices = list(range(nTopics))

        # cursor : start index of the current doc within corpus-wide lists
        cursor = 0
        for d in range(nDocTotal):
            # Mixture assumption: one topic per whole document
            chosenTopic = RandUtil.choice(topicChoices, beta, PRNG)
            resp[d, chosenTopic] = 1

            # countVec : vocabSize-vector, entry v counts appearances of word v
            countVec = RandUtil.multinomial(
                nWordsPerDoc, topics[chosenTopic, :], PRNG)

            # Keep only words that actually occur; record sparse ids + counts
            activeIDs = np.flatnonzero(countVec > 0)
            activeCounts = countVec[activeIDs]
            assert np.allclose(activeCounts.sum(), nWordsPerDoc)
            idsByDoc.append(activeIDs)
            countsByDoc.append(activeCounts)
            doc_range[d] = cursor
            cursor += activeIDs.size

        # Flatten per-doc lists into corpus-wide arrays
        word_id = np.hstack(idsByDoc)
        word_count = np.hstack(countsByDoc)
        doc_range[-1] = word_count.size

        # Ground-truth parameters for evaluation downstream
        TrueParams = dict(K=nTopics, topics=topics, beta=beta, resp=resp)

        return BagOfWordsData(
            word_id, word_count, doc_range, vocabSize, TrueParams=TrueParams)
Example #4
0
    def CreateToyDataFromLDAModel(cls, seed=101,
                                  nDocTotal=None,
                                  nWordsPerDoc=None,
                                  nWordsPerDocFunc=None,
                                  topic_prior=None, topics=None,
                                  alpha=None, proba_K=None,
                                  **kwargs):
        ''' Generates BagOfWordsData dataset via LDA generative model.

        Each document draws its own topic proportions from a Dirichlet
        with parameter topic_prior, then draws word counts per-topic
        via multinomials. If topic_prior is None it is built as
        alpha * proba_K.

        Returns
        ------
        Data : BagOfWordsData object
        '''
        if topic_prior is None:
            topic_prior = alpha * proba_K
        from bnpy.util import RandUtil

        # K topics over a vocabulary of V word types
        K = topics.shape[0]
        V = topics.shape[1]
        # Make sure topics sum to one
        topics = topics / topics.sum(axis=1)[:, np.newaxis]
        assert K == topic_prior.size

        doc_range = np.zeros(nDocTotal + 1)
        wordIDsPerDoc = list()
        wordCountsPerDoc = list()

        # Pi[d, :] : per-document topic proportions drawn below
        Pi = np.zeros((nDocTotal, K))
        respPerDoc = list()

        # startPos : tracks start index for current doc within corpus-wide
        # lists
        startPos = 0
        for d in range(nDocTotal):
            # Need docseed to be type int, have non-zero value for all d
            # (holds when seed > 0, since docseed = seed*(d+1) mod 1e8)
            docseed = (seed * d + seed) % (100000000)
            PRNG = np.random.RandomState(docseed)

            # Draw topic appearance probabilities for this document
            Pi[d, :] = PRNG.dirichlet(topic_prior)

            # Optionally vary the document length via a user-supplied
            # callable that consumes this document's PRNG
            if nWordsPerDocFunc is not None:
                nWordsPerDoc = nWordsPerDocFunc(PRNG)

            # Draw the topic assignments for this doc
            # Npercomp : K-vector, Npercomp[k] counts appearance of topic k
            Npercomp = RandUtil.multinomial(nWordsPerDoc, Pi[d, :], PRNG)

            # Draw the observed words for this doc
            # wordCountBins: V x 1 vector, entry v counts appearance of word v
            wordCountBins = np.zeros(V)
            for k in range(K):
                wordCountBins += RandUtil.multinomial(Npercomp[k],
                                                      topics[k, :], PRNG)

            # Record word_id, word_count, doc_range
            # Only word types with nonzero counts are stored (sparse format)
            wIDs = np.flatnonzero(wordCountBins > 0)
            wCounts = wordCountBins[wIDs]
            assert np.allclose(wCounts.sum(), nWordsPerDoc)
            wordIDsPerDoc.append(wIDs)
            wordCountsPerDoc.append(wCounts)
            doc_range[d] = startPos
            startPos += wIDs.size

            # Record expected local parameters (LP)
            # curResp[n, k] proportional to p(word wIDs[n] | topic k) * Pi[d, k]
            curResp = (topics[:, wIDs] * Pi[d, :][:, np.newaxis]).T
            respPerDoc.append(curResp)

        # Flatten per-doc lists into corpus-wide arrays
        word_id = np.hstack(wordIDsPerDoc)
        word_count = np.hstack(wordCountsPerDoc)
        doc_range[-1] = word_count.size

        # Make TrueParams dict
        # Normalize rows of resp so each token's responsibilities sum to one
        resp = np.vstack(respPerDoc)
        resp /= resp.sum(axis=1)[:, np.newaxis]
        TrueParams = dict(K=K, topics=topics,
                          beta=topic_prior / topic_prior.sum(),
                          topic_prior=topic_prior, resp=resp)

        Data = BagOfWordsData(
            word_id, word_count, doc_range, V, TrueParams=TrueParams)
        return Data