Example #1
    def __init__(self, X=None, doc_range=None, nDocTotal=None,
                 Xprev=None, TrueZ=None,
                 TrueParams=None, fileNames=None, summary=None, **kwargs):
        ''' Create an instance of GroupXData for provided array X

        Post Condition
        --------------
        self.X : 2D array, size N x D
            with standardized dtype, alignment, byteorder.
        self.Xprev : 2D array, size N x D
            with standardized dtype, alignment, byteorder.
        self.doc_range : 1D array, size nDoc+1
        '''
        self.X = as2D(toCArray(X, dtype=np.float64))
        self.doc_range = as1D(toCArray(doc_range, dtype=np.int32))
        if summary is not None:
            self.summary = summary
        if Xprev is not None:
            self.Xprev = as2D(toCArray(Xprev, dtype=np.float64))

        # Verify attributes are consistent
        self._set_dependent_params(doc_range, nDocTotal)
        self._check_dims()

        # Add optional true parameters / true hard labels
        if TrueParams is not None:
            self.TrueParams = dict()
            for key, arr in TrueParams.items():
                self.TrueParams[key] = toCArray(arr)

        if TrueZ is not None:
            if not hasattr(self, 'TrueParams'):
                self.TrueParams = dict()
            self.TrueParams['Z'] = as1D(toCArray(TrueZ))
            self.TrueParams['K'] = np.unique(self.TrueParams['Z']).size

        # Add optional source files for each group/sequence
        if fileNames is not None:
            if hasattr(fileNames, 'shape') and fileNames.shape == (1, 1):
                fileNames = fileNames[0, 0]
            if len(fileNames) > 1:
                self.fileNames = [str(x).strip()
                                  for x in np.squeeze(fileNames)]
            else:
                self.fileNames = [str(fileNames[0])]
        # Add extra data attributes custom for the dataset
        for key in kwargs:
            if hasattr(self, key):
                continue
            if not key.startswith("__"):
                arr = np.squeeze(as1D(kwargs[key]))
                if arr.shape == ():
                    try:
                        arr = float(arr)
                    except TypeError:
                        continue
                setattr(self, key, arr)
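
A minimal usage sketch for the constructor above (assumptions: GroupXData is importable from bnpy.data, and doc_range stores cumulative group boundaries as described in the docstring):

import numpy as np
from bnpy.data import GroupXData  # assumed import path

X = np.random.randn(10, 2)           # N=10 observations, D=2 features
doc_range = np.asarray([0, 6, 10])   # group 0 = rows 0..5, group 1 = rows 6..9
dataset = GroupXData(X=X, doc_range=doc_range, nDocTotal=2)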
Example #2
    def createPrior(self, Data, nu=0, B=None, ECovMat=None, sF=1.0, **kwargs):
        ''' Initialize Prior ParamBag attribute.

        Post Condition
        --------------
        Prior expected covariance matrix set to match provided value.
        '''
        D = self.D
        nu = np.maximum(nu, D + 2)
        if B is None:
            if ECovMat is None or isinstance(ECovMat, str):
                ECovMat = createECovMatFromUserInput(D, Data, ECovMat, sF)
            B = ECovMat * (nu - D - 1)
        else:
            B = as2D(B)
        self.Prior = ParamBag(K=0, D=D)
        self.Prior.setField('nu', nu, dims=None)
        self.Prior.setField('B', B, dims=('D', 'D'))
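
A small check of the scale-matrix choice above (a sketch only, assuming the prior over each covariance matrix is inverse-Wishart with degrees of freedom nu and scale B, whose mean is B / (nu - D - 1)):

import numpy as np

D = 3
nu = np.maximum(0, D + 2)     # mirrors nu = np.maximum(nu, D + 2) with default nu=0
ECovMat = np.eye(D)           # target prior expected covariance
B = ECovMat * (nu - D - 1)    # the choice made by createPrior when B is None
assert np.allclose(B / (nu - D - 1), ECovMat)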
Example #3
def SummaryAlg_cpp(initPi,
                   transPi,
                   SoftEv,
                   margPrObs,
                   fMsg,
                   bMsg,
                   mPairIDs=None,
                   order='C'):
    ''' Summary step for a single HMM sequence. Implemented in C++/Eigen.

    Combines forward/backward messages and soft evidence into expected
    transition counts and entropy tables.
    '''
    if not hasEigenLibReady:
        raise ValueError("Cannot find library %s. Please recompile." %
                         (libfilename))
    if order != 'C':
        raise NotImplementedError("LibFwdBwd only supports row-major order.")

    # Prep inputs
    T, K = SoftEv.shape
    initPi = np.asarray(initPi, order=order)
    transPi = np.asarray(transPi, order=order)
    SoftEv = np.asarray(SoftEv, order=order)
    margPrObs = np.asarray(margPrObs, order=order)
    fMsg = np.asarray(fMsg, order=order)
    bMsg = np.asarray(bMsg, order=order)

    if mPairIDs is None or len(mPairIDs) == 0:
        M = 0
        mPairIDs = np.zeros((0, 2))
    else:
        mPairIDs = as2D(np.asarray(mPairIDs, dtype=np.float64))
        M = mPairIDs.shape[0]
    assert mPairIDs.shape[0] == M
    assert mPairIDs.shape[1] == 2

    # Allocate outputs
    TransStateCount = np.zeros((K, K), order=order)
    Htable = np.zeros((K, K), order=order)
    mHtable = np.zeros((2 * M, K), order=order)

    # Execute C++ code for the summary step
    # (fills TransStateCount, Htable, and mHtable in-place)
    lib.SummaryAlg(initPi, transPi, SoftEv, margPrObs, fMsg, bMsg,
                   TransStateCount, Htable, mPairIDs, mHtable, K, T, M)
    return TransStateCount, Htable, mHtable
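
A hypothetical call sketch for SummaryAlg_cpp (requires the compiled Eigen library; the message arrays below are random placeholders, whereas in real use fMsg, bMsg, and margPrObs come from the forward/backward passes over the same sequence):

import numpy as np

T, K = 5, 3
initPi = np.full(K, 1.0 / K)
transPi = np.full((K, K), 1.0 / K)
SoftEv = np.random.rand(T, K)
margPrObs = np.random.rand(T)    # placeholder for forward-pass output
fMsg = np.random.rand(T, K)      # placeholder forward messages
bMsg = np.random.rand(T, K)      # placeholder backward messages
TransStateCount, Htable, mHtable = SummaryAlg_cpp(
    initPi, transPi, SoftEv, margPrObs, fMsg, bMsg)
print(TransStateCount.shape)     # (K, K)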
Example #4
def loadTopicModel(matfilepath,
                   queryLap=None,
                   prefix=None,
                   returnWordCounts=0,
                   returnTPA=0,
                   normalizeTopics=0,
                   normalizeProbs=0,
                   **kwargs):
    ''' Load saved topic model

    Returns
    -------
    topics : 2D array, K x vocab_size (if returnTPA)
    probs : 1D array, size K (if returnTPA)
    alpha : scalar (if returnTPA)
    eta : 1D array (if returnTPA and the saved model stores eta)
    hmodel : HModel
    WordCounts : 2D array, size K x vocab_size (if returnWordCounts)
    '''
    if prefix is None:
        prefix, lapQuery = getPrefixForLapQuery(matfilepath, queryLap)
    # avoids circular import
    from bnpy.HModel import HModel
    if len(glob.glob(os.path.join(matfilepath, "*.log_prob_w"))) > 0:
        return loadTopicModelFromMEDLDA(matfilepath,
                                        prefix,
                                        returnTPA=returnTPA)

    snapshotList = glob.glob(os.path.join(matfilepath, 'Lap*TopicSnapshot'))
    matfileList = glob.glob(os.path.join(matfilepath, 'Lap*TopicModel.mat'))
    if len(snapshotList) > 0:
        if prefix is None:
            snapshotList.sort()
            snapshotPath = snapshotList[-1]
        else:
            snapshotPath = None
            for curPath in snapshotList:
                if curPath.count(prefix):
                    snapshotPath = curPath
        return loadTopicModelFromTxtFiles(snapshotPath,
                                          normalizeTopics=normalizeTopics,
                                          normalizeProbs=normalizeProbs,
                                          returnWordCounts=returnWordCounts,
                                          returnTPA=returnTPA)

    if prefix is not None:
        matfilepath = os.path.join(matfilepath, prefix + 'TopicModel.mat')
    Mdict = loadDictFromMatfile(matfilepath)
    if 'SparseWordCount_data' in Mdict:
        data = np.asarray(Mdict['SparseWordCount_data'], dtype=np.float64)
        K = int(Mdict['K'])
        vocab_size = int(Mdict['vocab_size'])
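        # Counts may be stored in CSR form (data/indices/indptr) or as
        # 1-based (i, j) coordinate pairs; handle both layouts below.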
        try:
            indices = Mdict['SparseWordCount_indices']
            indptr = Mdict['SparseWordCount_indptr']
            WordCounts = scipy.sparse.csr_matrix((data, indices, indptr),
                                                 shape=(K, vocab_size))
        except KeyError:
            rowIDs = Mdict['SparseWordCount_i'] - 1
            colIDs = Mdict['SparseWordCount_j'] - 1
            WordCounts = scipy.sparse.csr_matrix((data, (rowIDs, colIDs)),
                                                 shape=(K, vocab_size))
        Mdict['WordCounts'] = WordCounts.toarray()
    if returnTPA:
        # Load topics : 2D array, K x vocab_size
        if 'WordCounts' in Mdict:
            topics = Mdict['WordCounts'] + Mdict['lam']
        else:
            topics = Mdict['topics']
        topics = as2D(toCArray(topics, dtype=np.float64))
        assert topics.ndim == 2
        K = topics.shape[0]
        if normalizeTopics:
            topics /= topics.sum(axis=1)[:, np.newaxis]

        # Load probs : 1D array, size K
        try:
            probs = Mdict['probs']
        except KeyError:
            probs = (1.0 / K) * np.ones(K)
        probs = as1D(toCArray(probs, dtype=np.float64))
        assert probs.ndim == 1
        assert probs.size == K
        if normalizeProbs:
            probs = probs / np.sum(probs)

        # Load alpha : scalar float > 0
        try:
            alpha = float(Mdict['alpha'])
        except KeyError:
            if 'alpha' in os.environ:
                alpha = float(os.environ['alpha'])
            else:
                raise ValueError('Unknown parameter alpha')
        if 'eta' in Mdict:
            return topics, probs, alpha, as1D(toCArray(Mdict['eta']))
        return topics, probs, alpha

    # No TPA output requested: rebuild a full HModel
    # (HDP allocation model if gamma was saved, otherwise a finite topic model)
    infAlg = 'VB'
    if 'gamma' in Mdict:
        aPriorDict = dict(alpha=Mdict['alpha'], gamma=Mdict['gamma'])
        HDPTopicModel = AllocModelConstructorsByName['HDPTopicModel']
        amodel = HDPTopicModel(infAlg, aPriorDict)
    else:
        FiniteTopicModel = AllocModelConstructorsByName['FiniteTopicModel']
        amodel = FiniteTopicModel(infAlg, dict(alpha=Mdict['alpha']))
    omodel = ObsModelConstructorsByName['Mult'](infAlg, **Mdict)
    hmodel = HModel(amodel, omodel)
    hmodel.set_global_params(**Mdict)
    if returnWordCounts:
        return hmodel, Mdict['WordCounts']
    return hmodel
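
A hypothetical usage sketch (the path below is made up for illustration; it should point at a results directory containing Lap*TopicModel.mat files; if the saved model stores eta, a fourth value is returned):

topics, probs, alpha = loadTopicModel(
    '/path/to/task_output/', returnTPA=1,
    normalizeTopics=1, normalizeProbs=1)
print(topics.shape)    # (K, vocab_size); rows sum to 1 after normalization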