示例#1
0
def packParamBagForPost(nu=None,
                        beta=None,
                        m=None,
                        kappa=None,
                        D=None,
                        Post=None,
                        **kwargs):
    ''' Store posterior arrays nu, beta, m, kappa as fields of a ParamBag.

    Args
    ----
    nu : array-like, converted with as1D, stored with dims 'K'
    beta : array-like, converted with as2D, stored with dims ('K', 'D')
    m : array-like, converted with as2D, stored with dims ('K', 'D')
    kappa : array-like, converted with as1D, stored with dims 'K'
    D : int, optional
        Expected size of the second axis. Defaults to second axis of m.
    Post : ParamBag, optional
        Existing bag to fill. Its K and D must match the provided arrays.
        If None, a new bag is created.
    **kwargs : ignored

    Returns
    -------
    Post : ParamBag, with K equal to the number of rows of m
    '''
    mean_KD = as2D(m)
    beta_KD = as2D(beta)

    if D is None:
        D = mean_KD.shape[1]

    # When the second axis does not have size D, use the transpose instead
    if mean_KD.shape[1] != D:
        mean_KD = mean_KD.T.copy()
    if beta_KD.shape[1] != D:
        beta_KD = beta_KD.T.copy()
    K, _unused = mean_KD.shape

    if Post is not None:
        assert isinstance(Post, ParamBag)
        assert Post.K == K
        assert Post.D == D
    else:
        Post = ParamBag(K=K, D=D)

    Post.setField('nu', as1D(nu), dims=('K'))
    Post.setField('beta', beta_KD, dims=('K', 'D'))
    Post.setField('m', mean_KD, dims=('K', 'D'))
    Post.setField('kappa', as1D(kappa), dims=('K'))
    return Post
def packParamBagForPost(pnu_K=None,
                        ptau_K=None,
                        w_KE=None,
                        P_KEE=None,
                        Post=None,
                        **kwargs):
    ''' Convert provided arrays and store them as fields of a ParamBag.

    Args
    ----
    pnu_K : array-like, converted with as1D, stored with dims 'K'
    ptau_K : array-like, converted with as1D, stored with dims 'K'
    w_KE : array-like, converted with as2D, stored with dims ('K', 'E')
    P_KEE : array-like, converted with as3D, stored with dims ('K','E','E')
    Post : ParamBag, optional
        Existing bag to fill. If it has no attribute E, one is attached.
        If None, a new bag is created with D = E - 1.
    **kwargs : ignored

    Returns
    -------
    Post : ParamBag, with K clusters
    '''
    pnu_K = as1D(pnu_K)
    ptau_K = as1D(ptau_K)
    w_KE = as2D(w_KE)
    P_KEE = as3D(P_KEE)

    # K is read from pnu_K, E from the second axis of w_KE
    nClusters = pnu_K.size
    nFeatPlusOne = w_KE.shape[1]

    if Post is None:
        Post = ParamBag(K=nClusters, D=nFeatPlusOne - 1, E=nFeatPlusOne)
    else:
        if not hasattr(Post, 'E'):
            Post.E = nFeatPlusOne

    # Dimensions of the bag must agree with the provided arrays
    assert Post.K == nClusters
    assert Post.D == nFeatPlusOne - 1
    assert Post.E == nFeatPlusOne

    Post.setField('pnu_K', pnu_K, dims=('K'))
    Post.setField('ptau_K', ptau_K, dims=('K'))
    Post.setField('w_KE', w_KE, dims=('K', 'E'))
    Post.setField('P_KEE', P_KEE, dims=('K', 'E', 'E'))
    return Post
示例#3
0
    def setPostFactors(self,
                       obsModel=None,
                       SS=None,
                       LP=None,
                       Data=None,
                       nu=0,
                       B=0,
                       **kwargs):
        ''' Set attribute Post from one of several possible sources.

        Sources are tried in this order:
        1) obsModel : copy its Post, or convert its EstParams
        2) LP and Data : compute summary stats, then update Post from them
        3) SS : update Post from provided summary stats
        4) nu and B : build Post directly from these arrays

        Post Condition
        --------------
        Cache is cleared. Attribute Post is set.
        Attribute K is set to Post.K here, except when obsModel has no Post
        (then setPostFromEstParams is called and nothing else is assigned).
        '''
        self.ClearCache()

        if obsModel is not None:
            if not hasattr(obsModel, 'Post'):
                self.setPostFromEstParams(obsModel.EstParams)
                return
            self.Post = obsModel.Post.copy()
            self.K = self.Post.K
            return

        # Local parameters plus data take precedence over any provided SS
        if not (LP is None or Data is None):
            SS = self.calcSummaryStats(Data, None, LP)

        if SS is None:
            # Number of components is read from the first axis of B
            nComp = B.shape[0]
            self.Post = ParamBag(K=nComp, D=self.D)
            self.Post.setField('nu', as1D(nu), dims=('K'))
            self.Post.setField('B', B, dims=('K', 'D', 'D'))
        else:
            self.updatePost(SS)
        self.K = self.Post.K
示例#4
0
def getPrefixForLapQuery(taskpath, lapQuery):
    ''' Find the saved checkpoint lap closest to the queried lap.

    Saved laps are read from 'snapshot_lap.txt' inside taskpath.
    If reading that file raises IOError, laps are instead parsed from
    characters 3 through 10 of the names of files matching 'Lap*Topic*'
    (or, if none exist, 'Lap*.log_prob_w').

    Args
    ----
    taskpath : str, directory holding the saved checkpoints
    lapQuery : number or None
        If None, the last saved lap is used.

    Returns
    --------
    prefix : str
        Result of makePrefixForLap for the chosen lap.
        For lap 1, prefix = 'Lap0001.000'.
        For lap 5.5, prefix = 'Lap0005.500'.
    lap : element of the array of saved laps
        lap checkpoint for saved params close to lapQuery
    '''
    try:
        saveLaps = np.loadtxt(os.path.join(taskpath, 'snapshot_lap.txt'))
    except IOError:
        candidates = glob.glob(os.path.join(taskpath, 'Lap*Topic*'))
        if len(candidates) == 0:
            candidates = glob.glob(os.path.join(taskpath, 'Lap*.log_prob_w'))
        assert len(candidates) > 0
        # File names look like 'Lap0005.500...': chars 3-10 hold the lap
        lapValues = [
            float(fpath.split(os.path.sep)[-1][3:11])
            for fpath in sorted(candidates)
        ]
        saveLaps = np.sort(np.asarray(lapValues))

    saveLaps = as1D(saveLaps)
    if lapQuery is not None:
        nearestID = np.argmin(np.abs(lapQuery - saveLaps))
        bestLap = saveLaps[nearestID]
    else:
        # No query given: take final saved value
        bestLap = saveLaps[-1]
    return makePrefixForLap(bestLap), bestLap
def calcSummaryStats(Data, SS, LP, **kwargs):
    ''' Calculate summary statistics for given dataset and local parameters

    Only dense responsibilities (LP['resp']) are supported.

    Args
    ----
    Data : dataset object with attributes X, Y, nObs, dim
        If Data has no attribute X_NE, it is created here (and cached on
        Data) by appending a column of ones to Data.X.
    SS : SuffStatBag or None
        If None, a new bag is created.
    LP : dict of local parameters, must contain key 'resp'

    Returns
    --------
    SS : SuffStatBag object, with K components.
        Has fields xxT_KEE, yx_KE, yy_K, and N (if not already present).

    Raises
    ------
    ValueError
        If LP has no key 'resp' (sparse responsibilities not implemented).
    '''
    if not hasattr(Data, 'X_NE'):
        Data.X_NE = np.hstack([Data.X, np.ones(Data.nObs)[:, np.newaxis]])

    Y_N = Data.Y
    X_NE = Data.X_NE
    E = X_NE.shape[1]

    if 'resp' not in LP:
        # Sparse responsibilities (LP['spR']) are not implemented yet.
        raise ValueError("TODO")

    # Dense responsibility calculations
    resp = LP['resp']
    K = resp.shape[1]
    S_yy_K = dotATB(resp, np.square(Y_N)).flatten()
    S_yx_KE = dotATB(resp, Y_N * X_NE)

    # Expected outer product.
    # Buffers from component 0 are reused in-place for later components.
    S_xxT_KEE = np.zeros((K, E, E))
    sqrtResp_k_N = np.sqrt(resp[:, 0])
    sqrtR_X_k_NE = sqrtResp_k_N[:, np.newaxis] * X_NE
    S_xxT_KEE[0] = dotATA(sqrtR_X_k_NE)
    for k in range(1, K):
        np.sqrt(resp[:, k], out=sqrtResp_k_N)
        np.multiply(sqrtResp_k_N[:, np.newaxis], X_NE, out=sqrtR_X_k_NE)
        S_xxT_KEE[k] = dotATA(sqrtR_X_k_NE)

    if SS is None:
        SS = SuffStatBag(K=K, D=Data.dim, E=E)
    elif not hasattr(SS, 'E'):
        SS._Fields.E = E
    SS.setField('xxT_KEE', S_xxT_KEE, dims=('K', 'E', 'E'))
    SS.setField('yx_KE', S_yx_KE, dims=('K', 'E'))
    SS.setField('yy_K', S_yy_K, dims=('K'))
    # Expected count for each k
    # Usually computed by allocmodel. But just in case...
    if not hasattr(SS, 'N'):
        SS.setField('N', resp.sum(axis=0), dims='K')
    return SS
示例#6
0
    def calcSmoothedMu(self, X, W=None):
        ''' Compute smoothed estimates of covariance and mean.

        Combines the (optionally weighted) statistics of X with the
        fields B, nu, m, kappa of self.Prior.

        Args
        ----
        X : 2D array, size N x D, or 1D array of size D, or None
            If None, estimates come from the prior alone.
        W : 1D array, size N, optional
            Weight for each row of X. If None, each row has weight one.

        Returns
        -------
        Mu_1 : 2D array, size D x D
            Expected value of Cov[ X[n] ]
        Mu_2 : 1D array, size D
            Expected value of Mean[ X[n] ]
        '''
        if X is None:
            return self.Prior.B / self.Prior.nu, self.Prior.m

        if X.ndim == 1:
            X = X[np.newaxis, :]
        D = X.shape[1]

        # Compute suff stats
        if W is not None:
            W = as1D(W)
            rootW_X = np.sqrt(W)[:, np.newaxis] * X
            S_xxT = np.dot(rootW_X.T, rootW_X)
            S_x = np.dot(W, X)
            S_w = np.sum(W)
        else:
            S_xxT = np.dot(X.T, X)
            S_x = np.sum(X, axis=0)
            S_w = X.shape[0]

        # Smoothed mean
        post_kappa = self.Prior.kappa + S_w
        Mu_2 = (self.Prior.m * self.Prior.kappa + S_x) / post_kappa

        # Smoothed covariance
        prior_kmmT = self.Prior.kappa * np.outer(self.Prior.m, self.Prior.m)
        post_kmmT = post_kappa * np.outer(Mu_2, Mu_2)
        post_B = S_xxT + self.Prior.B + prior_kmmT - post_kmmT
        Mu_1 = post_B / (self.Prior.nu + S_w)

        assert Mu_1.ndim == 2
        assert Mu_1.shape == (D, D)
        assert Mu_2.shape == (D, )
        return Mu_1, Mu_2
示例#7
0
    def calcSmoothedMu(self, X, W=None):
        ''' Compute smoothed estimates of per-dimension variance and mean.

        Combines the (optionally weighted) statistics of X with the
        fields beta, nu, m, kappa of self.Prior.

        Args
        ----
        X : 2D array, size N x D, or 1D array of size D, or None
            If None, estimates come from the prior alone.
        W : 1D array, size N, optional
            Weight for each row of X. If None, each row has weight one.

        Returns
        -------
        Mu_1 : 1D array, size D
            Expected value of Var[ X[n,d] ]
        Mu_2 : 1D array, size D
            Expected value of Mean[ X[n] ]
        '''
        if X is None:
            return self.Prior.beta / self.Prior.nu, self.Prior.m

        if X.ndim == 1:
            X = X[np.newaxis, :]
        D = X.shape[1]

        # Compute suff stats
        if W is not None:
            W = as1D(W)
            S_xx = np.dot(W, np.square(X))
            S_x = np.dot(W, X)
            S_w = np.sum(W)
        else:
            S_xx = np.sum(np.square(X), axis=0)
            S_x = np.sum(X, axis=0)
            S_w = X.shape[0]

        # Smoothed mean
        kappa_new = self.Prior.kappa + S_w
        Mu_2 = (self.Prior.m * self.Prior.kappa + S_x) / kappa_new

        # Smoothed variance
        prior_kmm = self.Prior.kappa * (self.Prior.m * self.Prior.m)
        post_kmm = kappa_new * (Mu_2 * Mu_2)
        beta_new = S_xx + self.Prior.beta + prior_kmm - post_kmm
        Mu_1 = beta_new / (self.Prior.nu + S_w)

        assert Mu_1.ndim == 1
        assert Mu_1.shape == (D, )
        assert Mu_2.shape == (D, )
        return Mu_1, Mu_2
示例#8
0
def calcSummaryStats(Data, SS, LP, **kwargs):
    ''' Calculate summary statistics for given dataset and local parameters

    Args
    ----
    Data : dataset object with attributes X and dim
    SS : SuffStatBag or None
        If None, a new bag is created.
    LP : dict of local parameters
        Uses dense responsibilities LP['resp'] when present,
        otherwise sparse responsibilities LP['spR'].

    Returns
    --------
    SS : SuffStatBag object, with K components.
        Has field xxT, and field N if it was not already present.
    '''
    X = Data.X
    D = Data.dim
    if 'resp' in LP:
        resp = LP['resp']
        K = resp.shape[1]
        # Compute expected outer-product statistic.
        # Buffers from component 0 are reused in-place for later components,
        # so only one N x D temporary is ever allocated.
        S_xxT = np.zeros((K, D, D))
        sqrtResp_k = np.sqrt(resp[:, 0])
        sqrtRX_k = sqrtResp_k[:, np.newaxis] * X
        S_xxT[0] = dotATA(sqrtRX_k)
        for k in range(1, K):
            np.sqrt(resp[:, k], out=sqrtResp_k)
            np.multiply(sqrtResp_k[:, np.newaxis], X, out=sqrtRX_k)
            S_xxT[k] = dotATA(sqrtRX_k)
    else:
        spR = LP['spR']
        K = spR.shape[1]
        # Compute expected outer-product statistic
        S_xxT = calcSpRXXT(X=X, spR_csr=spR)

    if SS is None:
        SS = SuffStatBag(K=K, D=D)
    # Expected outer-product for each state k
    SS.setField('xxT', S_xxT, dims=('K', 'D', 'D'))
    # Expected count for each k
    #  Usually computed by allocmodel. But sometimes not (eg TopicModel)
    if not hasattr(SS, 'N'):
        if 'resp' in LP:
            SS.setField('N', LP['resp'].sum(axis=0), dims='K')
        else:
            SS.setField('N', as1D(toCArray(LP['spR'].sum(axis=0))), dims='K')
    return SS
示例#9
0
def calcSummaryStats(Data, SS, LP, DataAtomType='doc', **kwargs):
    ''' Calculate summary statistics for given dataset and local parameters

    Args
    ----
    Data : dataset object
        Must provide vocab_size and, depending on DataAtomType, either
        getSparseDocTypeCountMatrix() or getSparseTokenTypeCountMatrix().
    SS : SuffStatBag or None
        If None, a new bag is created.
    LP : dict of local parameters
        Uses dense responsibilities LP['resp'] when present,
        otherwise sparse responsibilities LP['spR'].
    DataAtomType : str
        'doc' selects the doc-type count matrix. Any other value selects
        the token-type count matrix.

    Returns
    --------
    SS : SuffStatBag object, with K components.
        Has fields WordCounts and SumWordCounts.
        Field N is added only when DataAtomType is 'doc' and SS lacks it.
    '''
    useDense = 'resp' in LP
    if useDense:
        K = LP['resp'].shape[1]
    else:
        K = LP['spR'].shape[1]
    if SS is None:
        SS = SuffStatBag(K=K, D=Data.vocab_size)
    if DataAtomType == 'doc':
        # X : 2D sparse matrix, size nDoc x vocab_size
        X = Data.getSparseDocTypeCountMatrix()
        # WordCounts : 2D array, size K x vocab_size
        # obtained by sparse matrix multiply
        # here, '*' operator does this because X is sparse matrix type
        Nvec = None
        if useDense:
            WordCounts = LP['resp'].T * X
            if not hasattr(SS, 'N'):
                Nvec = LP['resp'].sum(axis=0)
        else:
            WordCounts = (LP['spR'].T * X).toarray()
            if not hasattr(SS, 'N'):
                Nvec = as1D(toCArray(LP['spR'].sum(axis=0)))
        if Nvec is not None:
            SS.setField('N', Nvec, dims=('K'))
    else:
        # 2D sparse matrix, size V x N
        X = Data.getSparseTokenTypeCountMatrix()
        if useDense:
            WordCounts = (X * LP['resp']).T  # matrix-matrix product
        else:
            WordCounts = (X * LP['spR']).T.toarray()
    SS.setField('WordCounts', WordCounts, dims=('K', 'D'))
    SS.setField('SumWordCounts', np.sum(WordCounts, axis=1), dims=('K'))
    return SS
    """
示例#10
0
    def setPostFactors(self,
                       obsModel=None,
                       SS=None,
                       LP=None,
                       Data=None,
                       nu=0,
                       B=0,
                       M=0,
                       V=0,
                       **kwargs):
        ''' Set Post attribute from one of several possible sources.

        Sources are tried in this order:
        1) obsModel : copy its Post, or convert its EstParams
        2) LP and Data : compute summary stats, then update Post from them
        3) SS : update Post from provided summary stats
        4) nu, B, M, V : build Post directly from these arrays

        Post Condition
        --------------
        Cache is cleared. Attributes Post and K are set.
        '''
        self.ClearCache()

        if obsModel is not None:
            if not hasattr(obsModel, 'Post'):
                self.setPostFromEstParams(obsModel.EstParams)
            else:
                self.Post = obsModel.Post.copy()
            self.K = self.Post.K
            return

        # Local parameters plus data take precedence over any provided SS
        if not (LP is None or Data is None):
            SS = self.calcSummaryStats(Data, None, LP)

        if SS is None:
            M = as3D(M)
            B = as3D(B)
            V = as3D(V)

            # Shape of M determines K and must agree with this model's D, E
            nComp, nRow, nCol = M.shape
            assert nRow == self.D
            assert nCol == self.E
            self.Post = ParamBag(K=nComp, D=self.D, E=self.E)
            self.Post.setField('nu', as1D(nu), dims=('K'))
            self.Post.setField('B', B, dims=('K', 'D', 'D'))
            self.Post.setField('M', M, dims=('K', 'D', 'E'))
            self.Post.setField('V', V, dims=('K', 'E', 'E'))
        else:
            self.updatePost(SS)
        self.K = self.Post.K
示例#11
0
    def calcSmoothedMu(self, X, W=None):
        ''' Compute smoothed estimate of mean of statistic xxT.

        Combines the (optionally weighted) outer-product statistic of X
        with fields B and nu of self.Prior. The prior count used here is
        Prior.nu - D - 1, where D is this object's attribute D.

        Args
        ----
        X : 2D array, size N x D, or 1D array of size D, or None
            If None, the estimate comes from the prior alone.
        W : 1D array, size N, optional
            Weight for each row of X. If None, each row has weight one.

        Returns
        -------
        Mu : 2D array, size D x D
        '''
        priorCount = self.Prior.nu - self.D - 1

        if X is None:
            return self.Prior.B / (priorCount)

        if X.ndim == 1:
            X = X[np.newaxis, :]
        D = X.shape[1]

        # Compute suff stats
        if W is not None:
            W = as1D(W)
            rootW_X = np.sqrt(W)[:, np.newaxis] * X
            S_xxT = np.dot(rootW_X.T, rootW_X)
            S_w = np.sum(W)
        else:
            S_xxT = np.dot(X.T, X)
            S_w = X.shape[0]

        Mu = (self.Prior.B + S_xxT) / (priorCount + S_w)
        assert Mu.ndim == 2
        assert Mu.shape == (D, D)
        return Mu
示例#12
0
def calcSummaryStats(Data, SS, LP, **kwargs):
    ''' Calculate summary statistics for given dataset and local parameters

    Args
    ----
    Data : dataset object with attributes X and dim
    SS : SuffStatBag or None
        If None, a new bag is created.
    LP : dict of local parameters
        Uses dense responsibilities LP['resp'] when present,
        otherwise sparse responsibilities LP['spR'].

    Returns
    --------
    SS : SuffStatBag object, with K components.
        Has fields x and xx, and field N if it was not already present.
    '''
    X = Data.X
    hasDenseResp = 'resp' in LP
    if not hasDenseResp:
        spR = LP['spR']
        K = spR.shape[1]
        # Mean statistic, then sum-of-squares statistic
        S_x = spR.T * X
        S_xx = calcSpRXX(X=X, spR_csr=spR)
    else:
        resp = LP['resp']
        K = resp.shape[1]
        # Mean statistic, then sum-of-squares statistic
        S_x = dotATB(resp, X)
        S_xx = calcRXX_withDenseResp(resp, X)

    if SS is None:
        SS = SuffStatBag(K=K, D=Data.dim)
    # Expected mean for each state k
    SS.setField('x', S_x, dims=('K', 'D'))
    # Expected sum-of-squares for each state k
    SS.setField('xx', S_xx, dims=('K', 'D'))

    # Expected count for each k
    #  Usually computed by allocmodel. But sometimes not (eg TopicModel)
    if not hasattr(SS, 'N'):
        if hasDenseResp:
            Nvec = LP['resp'].sum(axis=0)
        else:
            Nvec = as1D(toCArray(LP['spR'].sum(axis=0)))
        SS.setField('N', Nvec, dims='K')
    return SS
def createParamBagForPrior(Data=None,
                           D=0,
                           pnu=0,
                           ptau=None,
                           w_E=0,
                           P_EE=None,
                           P_diag_E=None,
                           P_diag_val=1.0,
                           Prior=None,
                           **kwargs):
    ''' Initialize Prior ParamBag attribute.

    Args
    ----
    Data : dataset object with attribute dim, optional
        If provided, Data.dim overrides argument D.
    D : int, used only when Data is None
    pnu : scalar, floored at 1e-9
    ptau : scalar or None, floored at 1e-9
        None is treated as zero, so it ends up at the floor value.
    w_E : scalar or array-like
        If it has fewer than E entries, it is tiled to length E.
    P_EE : 2D array, size E x E, optional
    P_diag_E : 1D array, size E, optional
        Diagonal of P_EE. Used only if P_EE is None.
    P_diag_val : scalar
        Constant diagonal of P_EE. Used only if P_EE and P_diag_E are None.
    Prior : ParamBag, optional
        Existing bag to fill. Its D and E must match. If None, a new bag
        is created.
    **kwargs : ignored

    Returns
    -------
    Prior : ParamBag
        with dimension attributes K, D, E
        with parameter attributes pnu, ptau, w_E, P_EE, Pw_E, wPw_1
    '''
    if Data is None:
        D = int(D)
    else:
        D = int(Data.dim)
    E = D + 1

    # Init parameters of 1D Wishart prior on delta
    pnu = np.maximum(pnu, 1e-9)
    if ptau is None:
        # np.maximum cannot compare None with a float on Python 3.
        # Using zero here sends the default to the floor value below.
        ptau = 0
    ptau = np.maximum(ptau, 1e-9)

    # Initialize precision matrix of the weight vector
    if P_EE is not None:
        P_EE = np.asarray(P_EE)
    elif P_diag_E is not None:
        P_EE = np.diag(np.asarray(P_diag_E))
    else:
        P_EE = np.diag(P_diag_val * np.ones(E))
    assert P_EE.ndim == 2
    assert P_EE.shape == (E, E)

    # Initialize mean of the weight vector
    w_E = as1D(np.asarray(w_E))
    if w_E.size < E:
        w_E = np.tile(w_E, E)[:E]
    assert w_E.ndim == 1
    assert w_E.size == E

    if Prior is None:
        Prior = ParamBag(K=0, D=D, E=E)
    if not hasattr(Prior, 'E'):
        Prior.E = E
    assert Prior.D == D
    assert Prior.E == E
    Prior.setField('pnu', pnu, dims=None)
    Prior.setField('ptau', ptau, dims=None)
    Prior.setField('w_E', w_E, dims=('E'))
    Prior.setField('P_EE', P_EE, dims=('E', 'E'))

    # Cache products of precision matrix and mean vector
    Pw_E = np.dot(P_EE, w_E)
    wPw_1 = np.dot(w_E, Pw_E)
    Prior.setField('Pw_E', Pw_E, dims=('E'))
    Prior.setField('wPw_1', wPw_1, dims=None)
    return Prior
示例#14
0
def loadTopicModelFromTxtFiles(snapshotPath,
                               returnTPA=False,
                               returnWordCounts=False,
                               normalizeProbs=True,
                               normalizeTopics=True,
                               **kwargs):
    ''' Load topic model from the text files of one snapshot directory.

    Args
    ----
    snapshotPath : str, directory holding the snapshot's .txt files
    returnTPA : if true, return (topics, probs, alpha) instead of a model
    returnWordCounts : if true, also return the WordCounts array
    normalizeProbs : if true, divide probs by its sum (returnTPA only)
    normalizeTopics : if true, divide each row of topics by its sum
        (returnTPA only)
    **kwargs : ignored

    Returns
    -------
    topics, probs, alpha : if returnTPA
    hmodel, WordCounts : if returnWordCounts (and not returnTPA)
    hmodel : otherwise
    '''
    # Some files are stored under alternate names. Map to canonical keys.
    renamedKeys = dict(beta='lam', nTopics='K', nTypes='vocab_size')
    candidateKeys = (
        'K', 'probs', 'alpha', 'beta', 'lam', 'gamma', 'nTopics', 'nTypes',
        'vocab_size',
    )
    Mdict = dict()
    for fileKey in candidateKeys:
        # Best effort: any file that fails to load is skipped
        try:
            loadedArr = np.loadtxt(snapshotPath + "/%s.txt" % (fileKey))
        except Exception:
            continue
        Mdict[renamedKeys.get(fileKey, fileKey)] = loadedArr
    assert 'K' in Mdict
    assert 'lam' in Mdict
    K = int(Mdict['K'])
    V = int(Mdict['vocab_size'])

    topicsPath = snapshotPath + "/topics.txt"
    if os.path.exists(topicsPath):
        topics_KV = as2D(toCArray(np.loadtxt(topicsPath), dtype=np.float64))
        Mdict['topics'] = topics_KV
        assert topics_KV.ndim == 2
        assert topics_KV.shape == (K, V)
    else:
        # Rebuild dense counts from a compressed sparse representation
        TWC_data = np.loadtxt(snapshotPath + "/TopicWordCount_data.txt")
        TWC_inds = np.loadtxt(snapshotPath + "/TopicWordCount_indices.txt",
                              dtype=np.int32)
        cscPath = snapshotPath + "/TopicWordCount_cscindptr.txt"
        if os.path.exists(cscPath):
            TWC_indptr = np.loadtxt(cscPath, dtype=np.int32)
            makeSparse = scipy.sparse.csc_matrix
        else:
            TWC_indptr = np.loadtxt(snapshotPath +
                                    "/TopicWordCount_indptr.txt",
                                    dtype=np.int32)
            makeSparse = scipy.sparse.csr_matrix
        TWC = makeSparse((TWC_data, TWC_inds, TWC_indptr), shape=(K, V))
        Mdict['WordCounts'] = TWC.toarray()

    if returnTPA:
        # Load topics : 2D array, K x vocab_size
        if 'WordCounts' in Mdict:
            topics = Mdict['WordCounts'] + Mdict['lam']
        else:
            topics = Mdict['topics']
        topics = as2D(toCArray(topics, dtype=np.float64))
        assert topics.ndim == 2
        K = topics.shape[0]
        if normalizeTopics:
            topics /= topics.sum(axis=1)[:, np.newaxis]

        # Load probs : 1D array, size K. Uniform if not saved.
        if 'probs' in Mdict:
            probs = Mdict['probs']
        else:
            probs = (1.0 / K) * np.ones(K)
        probs = as1D(toCArray(probs, dtype=np.float64))
        assert probs.ndim == 1
        assert probs.size == K
        if normalizeProbs:
            probs = probs / np.sum(probs)

        # Load alpha : scalar float > 0
        # Falls back to environment variable 'alpha' if not saved.
        if 'alpha' in Mdict:
            alpha = float(Mdict['alpha'])
        elif 'alpha' in os.environ:
            alpha = float(os.environ['alpha'])
        else:
            raise ValueError('Unknown parameter alpha')
        return topics, probs, alpha

    # BUILD HMODEL FROM LOADED TXT
    infAlg = 'VB'
    # avoids circular import
    from bnpy.HModel import HModel
    if 'gamma' not in Mdict:
        makeAllocModel = AllocModelConstructorsByName['FiniteTopicModel']
        amodel = makeAllocModel(infAlg, dict(alpha=Mdict['alpha']))
    else:
        aPriorDict = dict(alpha=Mdict['alpha'], gamma=Mdict['gamma'])
        makeAllocModel = AllocModelConstructorsByName['HDPTopicModel']
        amodel = makeAllocModel(infAlg, aPriorDict)
    omodel = ObsModelConstructorsByName['Mult'](infAlg, **Mdict)
    hmodel = HModel(amodel, omodel)
    hmodel.set_global_params(**Mdict)
    if returnWordCounts:
        return hmodel, Mdict['WordCounts']
    return hmodel
示例#15
0
def loadTopicModel(matfilepath,
                   queryLap=None,
                   prefix=None,
                   returnWordCounts=0,
                   returnTPA=0,
                   normalizeTopics=0,
                   normalizeProbs=0,
                   **kwargs):
    ''' Load saved topic model

    Three on-disk layouts are tried, in this order:
    1) files matching '*.log_prob_w' : handled by loadTopicModelFromMEDLDA
    2) directories matching 'Lap*TopicSnapshot' : handled by
       loadTopicModelFromTxtFiles
    3) a .mat file, read with loadDictFromMatfile

    Args
    ----
    matfilepath : str, directory (or .mat file) holding the saved model
    queryLap : number or None
        Used to pick a checkpoint when prefix is not given.
    prefix : str or None
        Checkpoint prefix. If None, found via getPrefixForLapQuery.
    returnWordCounts, returnTPA : flags selecting the return values
    normalizeTopics, normalizeProbs : flags, used with returnTPA
    **kwargs : ignored

    Returns
    -------
    topics : 2D array, K x vocab_size (if returnTPA)
    probs : 1D array, size K (if returnTPA)
    alpha : scalar (if returnTPA)
    eta : 1D array (if returnTPA and the .mat file has key 'eta')
    hmodel : HModel
    WordCounts : 2D array, size K x vocab_size (if returnWordCounts)

    Raises
    ------
    ValueError
        If snapshot directories exist but none matches the prefix,
        or if parameter alpha cannot be found.
    '''
    if prefix is None:
        prefix, _ = getPrefixForLapQuery(matfilepath, queryLap)
    if len(glob.glob(os.path.join(matfilepath, "*.log_prob_w"))) > 0:
        return loadTopicModelFromMEDLDA(matfilepath,
                                        prefix,
                                        returnTPA=returnTPA)

    snapshotList = glob.glob(os.path.join(matfilepath, 'Lap*TopicSnapshot'))
    if len(snapshotList) > 0:
        if prefix is None:
            snapshotList.sort()
            snapshotPath = snapshotList[-1]
        else:
            # Keep the last snapshot whose path contains the prefix
            snapshotPath = None
            for curPath in snapshotList:
                if curPath.count(prefix):
                    snapshotPath = curPath
            if snapshotPath is None:
                # Fail here with a clear message, instead of passing None
                # along and failing later with an obscure assertion.
                raise ValueError(
                    'No topic snapshot in %s matches prefix %s' %
                    (matfilepath, prefix))
        return loadTopicModelFromTxtFiles(snapshotPath,
                                          normalizeTopics=normalizeTopics,
                                          normalizeProbs=normalizeProbs,
                                          returnWordCounts=returnWordCounts,
                                          returnTPA=returnTPA)

    if prefix is not None:
        matfilepath = os.path.join(matfilepath, prefix + 'TopicModel.mat')
    Mdict = loadDictFromMatfile(matfilepath)
    if 'SparseWordCount_data' in Mdict:
        data = np.asarray(Mdict['SparseWordCount_data'], dtype=np.float64)
        K = int(Mdict['K'])
        vocab_size = int(Mdict['vocab_size'])
        try:
            indices = Mdict['SparseWordCount_indices']
            indptr = Mdict['SparseWordCount_indptr']
            WordCounts = scipy.sparse.csr_matrix((data, indices, indptr),
                                                 shape=(K, vocab_size))
        except KeyError:
            # Alternate storage as row/column ids, shifted down by one here
            rowIDs = Mdict['SparseWordCount_i'] - 1
            colIDs = Mdict['SparseWordCount_j'] - 1
            WordCounts = scipy.sparse.csr_matrix((data, (rowIDs, colIDs)),
                                                 shape=(K, vocab_size))
        Mdict['WordCounts'] = WordCounts.toarray()
    if returnTPA:
        # Load topics : 2D array, K x vocab_size
        if 'WordCounts' in Mdict:
            topics = Mdict['WordCounts'] + Mdict['lam']
        else:
            topics = Mdict['topics']
        topics = as2D(toCArray(topics, dtype=np.float64))
        assert topics.ndim == 2
        K = topics.shape[0]
        if normalizeTopics:
            topics /= topics.sum(axis=1)[:, np.newaxis]

        # Load probs : 1D array, size K
        try:
            probs = Mdict['probs']
        except KeyError:
            probs = (1.0 / K) * np.ones(K)
        probs = as1D(toCArray(probs, dtype=np.float64))
        assert probs.ndim == 1
        assert probs.size == K
        if normalizeProbs:
            probs = probs / np.sum(probs)

        # Load alpha : scalar float > 0
        try:
            alpha = float(Mdict['alpha'])
        except KeyError:
            if 'alpha' in os.environ:
                alpha = float(os.environ['alpha'])
            else:
                raise ValueError('Unknown parameter alpha')
        if 'eta' in Mdict:
            return topics, probs, alpha, as1D(toCArray(Mdict['eta']))
        return topics, probs, alpha

    infAlg = 'VB'
    # avoids circular import
    from bnpy.HModel import HModel
    if 'gamma' in Mdict:
        aPriorDict = dict(alpha=Mdict['alpha'], gamma=Mdict['gamma'])
        HDPTopicModel = AllocModelConstructorsByName['HDPTopicModel']
        amodel = HDPTopicModel(infAlg, aPriorDict)
    else:
        FiniteTopicModel = AllocModelConstructorsByName['FiniteTopicModel']
        amodel = FiniteTopicModel(infAlg, dict(alpha=Mdict['alpha']))
    omodel = ObsModelConstructorsByName['Mult'](infAlg, **Mdict)
    hmodel = HModel(amodel, omodel)
    hmodel.set_global_params(**Mdict)
    if returnWordCounts:
        return hmodel, Mdict['WordCounts']
    return hmodel
示例#16
0
def plotGauss1DFromHModel(hmodel,
                          compListToPlot=None,
                          compsToHighlight=None,
                          activeCompIDs=None,
                          MaxKToDisplay=50,
                          proba_thr=0.0001,
                          ax_handle=None,
                          Colors=Colors,
                          dataset=None,
                          **kwargs):
    ''' Make line plot of pdf for each component (1D observations).

    Args
    ----
    hmodel : model with attributes allocModel and obsModel
    compListToPlot : list of component IDs to plot
        Defaults to all components of the obsModel.
    compsToHighlight : component IDs to draw in color
        If provided, all other plotted components are drawn in black.
        If None, every plotted component is drawn in color.
    activeCompIDs : array of IDs, one per component of the model
        Position in this array gives the component's index in the model.
        Defaults to 0, 1, ... K-1.
    MaxKToDisplay : int, plotting stops after this many components
    proba_thr : float
        Components with appearance probability below this value are
        skipped, unless they are highlighted.
    ax_handle : axes to make current before plotting, optional
    Colors : sequence of colors, cycled through by position in
        compListToPlot
    dataset : object, optional
        If it has attribute X, a normalized 50-bin histogram of the
        first column of X is drawn.
    **kwargs : ignored
    '''
    if ax_handle is not None:
        pylab.sca(ax_handle)

    if compsToHighlight is not None:
        compsToHighlight = as1D(np.asarray(compsToHighlight))
    else:
        compsToHighlight = list()
    if compListToPlot is None:
        compListToPlot = np.arange(0, hmodel.obsModel.K)
    if activeCompIDs is None:
        activeCompIDs = np.arange(0, hmodel.obsModel.K)

    # Load appearance probabilities as single vector
    if hmodel.allocModel.K == hmodel.obsModel.K:
        w = hmodel.allocModel.get_active_comp_probs()
    else:
        w = np.ones(hmodel.obsModel.K)

    if dataset is not None:
        if hasattr(dataset, 'X'):
            # 'density' replaces the 'normed' keyword, which newer
            # matplotlib releases no longer accept.
            pylab.hist(dataset.X[:, 0], 50, density=True)

    nSkip = 0
    nGood = 0
    for ii, compID in enumerate(compListToPlot):
        if compID not in activeCompIDs:
            continue

        # Translate component ID to its index within the model
        kk = np.flatnonzero(activeCompIDs == compID)
        assert kk.size == 1
        kk = kk[0]

        if w[kk] < proba_thr and compID not in compsToHighlight:
            nSkip += 1
            continue

        mu = hmodel.obsModel.get_mean_for_comp(kk)
        sigma2 = hmodel.obsModel.get_covar_mat_for_comp(kk)

        if len(compsToHighlight) == 0 or compID in compsToHighlight:
            color = Colors[ii % len(Colors)]
            plotGauss1D(mu, sigma2, color=color)
        else:
            # Not highlighted. compsToHighlight holds IDs, so membership
            # must be decided by compID (done above), never by index kk.
            plotGauss1D(mu, sigma2, color='k')

        nGood += 1
        if nGood >= MaxKToDisplay:
            print('DISPLAY LIMIT EXCEEDED. Showing %d/%d components'
                  % (nGood, len(activeCompIDs)))
            break
    if nSkip > 0:
        # %g keeps small thresholds readable (%.2f would show 0.00)
        print('SKIPPED %d comps with size below %g' % (nSkip, proba_thr))
示例#17
0
    def __init__(self,
                 X=None,
                 doc_range=None,
                 nDocTotal=None,
                 Xprev=None,
                 TrueZ=None,
                 TrueParams=None,
                 fileNames=None,
                 summary=None,
                 **kwargs):
        ''' Create an instance of GroupXData for provided array X

        Args
        ----
        X : array-like, observed data
        doc_range : array-like, boundaries of each group within X
        nDocTotal : passed along to _set_dependent_params
        Xprev : array-like, optional
        TrueZ : array-like of true hard labels, optional
        TrueParams : dict of true parameters, optional
        fileNames : sequence of source file names, optional
        summary : stored as-is under attribute summary, optional
        **kwargs : extra attributes to attach to this dataset
            Keys that name an existing attribute, or that start with
            "__", are skipped.

        Post Condition
        ---------
        self.X : 2D array, size N x D
            with standardized dtype, alignment, byteorder.
        self.Xprev : 2D array, size N x D
            with standardized dtype, alignment, byteorder.
        self.doc_range : 1D array, size nDoc+1
        '''
        self.X = as2D(toCArray(X, dtype=np.float64))
        self.doc_range = as1D(toCArray(doc_range, dtype=np.int32))
        if summary is not None:
            self.summary = summary
        if Xprev is not None:
            self.Xprev = as2D(toCArray(Xprev, dtype=np.float64))

        # Verify attributes are consistent
        self._set_dependent_params(doc_range, nDocTotal)
        self._check_dims()

        # Add optional true parameters / true hard labels
        if TrueParams is not None:
            self.TrueParams = dict()
            for name, value in TrueParams.items():
                self.TrueParams[name] = toCArray(value)

        if TrueZ is not None:
            if not hasattr(self, 'TrueParams'):
                self.TrueParams = dict()
            trueLabels = as1D(toCArray(TrueZ))
            self.TrueParams['Z'] = trueLabels
            self.TrueParams['K'] = np.unique(trueLabels).size

        # Add optional source files for each group/sequence
        if fileNames is not None:
            # Unwrap names nested inside an array of shape (1, 1)
            if hasattr(fileNames, 'shape') and fileNames.shape == (1, 1):
                fileNames = fileNames[0, 0]
            if len(fileNames) <= 1:
                self.fileNames = [str(fileNames[0])]
            else:
                self.fileNames = [
                    str(fname).strip() for fname in np.squeeze(fileNames)
                ]

        # Add extra data attributes custom for the dataset
        for key, rawValue in kwargs.items():
            if hasattr(self, key) or key.startswith("__"):
                continue
            value = np.squeeze(as1D(rawValue))
            if value.shape == ():
                # Store zero-dim arrays as plain floats when possible
                try:
                    value = float(value)
                except TypeError:
                    continue
            setattr(self, key, value)
示例#18
0
    def __init__(self,
                 X=None,
                 nObsTotal=None,
                 TrueZ=None,
                 Xprev=None,
                 Y=None,
                 TrueParams=None,
                 name=None,
                 summary=None,
                 dtype='auto',
                 row_names=None,
                 column_names=None,
                 y_column_names=None,
                 xprev_column_names=None,
                 do_copy=True,
                 **kwargs):
        ''' Constructor for XData instance given in-memory dense array X.

        Args
        ----
        X : array with attribute dtype, observed data
        nObsTotal : passed along to _set_dependent_params
        TrueZ : array-like of true hard labels, optional
        Xprev : array-like, optional
        Y : array-like, optional
        TrueParams : dict of true parameters, optional
        name : converted to str and stored under attribute name, optional
        summary : stored as-is under attribute summary, optional
        dtype : 'auto' or a dtype
            'auto' keeps the dtype of X.
        row_names : optional
            NOTE(review): provided values are not stored by this code.
            Attribute row_names is always 0, 1, ... nObs-1.
        column_names : sequence of length dim, optional
            Defaults to 'dim_0', 'dim_1', ...
        y_column_names, xprev_column_names : not used in this method
        do_copy : bool
            If false and X already has the requested dtype, X is passed
            to as2D directly instead of through toCArray.
        **kwargs : ignored

        Post Condition
        ---------
        self.X : 2D array, size N x D
            with standardized dtype, alignment, byteorder.
        self.row_names : 1D array of ints, size nObs
        self.column_names : list of str, size dim
        '''
        if dtype == 'auto':
            dtype = X.dtype
        if not do_copy and X.dtype == dtype:
            self.X = as2D(X)
        else:
            self.X = as2D(toCArray(X, dtype=dtype))

        if Xprev is not None:
            self.Xprev = as2D(toCArray(Xprev, dtype=dtype))
        if Y is not None:
            self.Y = as2D(toCArray(Y, dtype=dtype))

        # Verify attributes are consistent
        self._set_dependent_params(nObsTotal=nObsTotal)
        self._check_dims(do_copy=do_copy)

        # Add optional true parameters / true hard labels
        if TrueParams is not None:
            self.TrueParams = TrueParams
        if TrueZ is not None:
            if not hasattr(self, 'TrueParams'):
                self.TrueParams = dict()
            self.TrueParams['Z'] = as1D(toCArray(TrueZ))
            self.TrueParams['K'] = np.unique(self.TrueParams['Z']).size
        if summary is not None:
            self.summary = summary
        if name is not None:
            self.name = str(name)

        # Row names are always the integer positions of the rows.
        # The earlier code that stored the caller's row_names was disabled.
        self.row_names = np.arange(0, self.nObs, 1)

        # Add optional column names.
        # Built as real lists: on Python 3, map() gives a one-shot
        # iterator that has no length and is empty after the first pass.
        if column_names is None:
            self.column_names = ["dim_%d" % n for n in range(self.dim)]
        else:
            assert len(column_names) == self.dim
            self.column_names = [str(cname) for cname in column_names]
示例#19
0
 def from_dict(self, Dict):
     ''' Set attributes inferType, K, rho, omega from entries of Dict.

     Entries 'rho' and 'omega' are converted with as1D before storing.
     '''
     for field in ('inferType', 'K'):
         setattr(self, field, Dict[field])
     for field in ('rho', 'omega'):
         setattr(self, field, as1D(Dict[field]))