Пример #1
0
def makeDataSliceFromSharedMem(dataShMemDict,
                               cslice=(0, None),
                               batchID=None):
    """ Create bag-of-words data slice from shared-memory arrays.

    Parameters
    ----------
    dataShMemDict : dict
        Must provide entries 'doc_range', 'word_id', 'word_count' (each
        readable by sharedMemToNumpyArray) and 'vocab_size'.
        When batchID is given and is a key of this dict, the value stored
        under that key is used as the data dict instead.
    cslice : tuple (start, stop) or None
        Document indices bounding the slice; document `stop` is excluded.
        None in place of the tuple selects every document.
        A None start means 0; a None stop means doc_range.size - 1.
    batchID : hashable, optional
        Key selecting a per-batch sub-dict of dataShMemDict.

    Returns
    -------
    Dslice : BagOfWordsData object
    """
    if batchID is not None and batchID in dataShMemDict:
        dataShMemDict = dataShMemDict[batchID]

    # Make local views (NOT copies) to shared mem arrays
    doc_range = sharedMemToNumpyArray(dataShMemDict['doc_range'])
    word_id = sharedMemToNumpyArray(dataShMemDict['word_id'])
    word_count = sharedMemToNumpyArray(dataShMemDict['word_count'])
    vocab_size = int(dataShMemDict['vocab_size'])

    # Largest usable stop index: the last position of doc_range
    lastIndex = doc_range.size - 1
    if cslice is None:
        start, stop = 0, lastIndex
    else:
        # Resolve each bound on its own, so an open-ended stop
        # does not discard a caller-provided start.
        start = 0 if cslice[0] is None else cslice[0]
        stop = lastIndex if cslice[1] is None else cslice[1]

    # Token-level offsets that match the chosen document bounds
    tstart = doc_range[start]
    tstop = doc_range[stop]
    Dslice = BagOfWordsData(
        vocab_size=vocab_size,
        # Shift offsets so the slice's first document starts at 0
        doc_range=doc_range[start:stop + 1] - doc_range[start],
        word_id=word_id[tstart:tstop],
        word_count=word_count[tstart:tstop],
        nDoc=stop - start,
        )
    return Dslice
Пример #2
0
def makeDataSliceFromSharedMem(dataShMemDict, cslice=(0, None), batchID=None):
    """ Create data slice from provided raw arrays and slice indicators.

    Parameters
    ----------
    dataShMemDict : dict
        Must provide entries 'X' (readable by sharedMemToNumpyArray) and
        'nObsTotal'. Entries 'Xprev' and 'Y' are optional.
        When batchID is given and is a key of this dict, the value stored
        under that key is used as the data dict instead.
    cslice : tuple (start, stop) or None
        Row indices bounding the slice; row `stop` is excluded.
        None in place of the tuple selects every row of X.
        A None start means 0; a None stop means the number of rows of X.
    batchID : hashable, optional
        Key selecting a per-batch sub-dict of dataShMemDict.

    Returns
    -------
    Dslice : XData object
        Built with do_copy=False from rows start:stop of X (and of Xprev
        and Y when those entries exist, otherwise None is passed).
        Represents subset of rows identified by cslice tuple.

    Example
    -------
    NOTE(review): attribute names used below (nObs, dim) should be
    confirmed against the current XData class.

    >>> dataset = XData(np.random.rand(25,2))
    >>> shMemDict = dataset.getRawDataAsSharedMemDict()
    >>> cur_slice = makeDataSliceFromSharedMem(shMemDict)
    >>> np.allclose(dataset.X, cur_slice.X)
    True
    >>> np.allclose(dataset.nObs, cur_slice.nObs)
    True
    >>> dataset.dim == cur_slice.dim
    True
    >>> a_slice = makeDataSliceFromSharedMem(shMemDict, (0, 2))
    >>> a_slice.nObs
    2
    """
    if batchID is not None and batchID in dataShMemDict:
        dataShMemDict = dataShMemDict[batchID]

    # Make local views (NOT copies) to shared mem arrays
    X = sharedMemToNumpyArray(dataShMemDict['X'])
    nObsTotal = int(dataShMemDict['nObsTotal'])

    # Unpacking into exactly two names also rejects an X that is not 2-D
    N, _ = X.shape
    if cslice is None:
        start, stop = 0, N
    else:
        # Resolve each bound on its own, so an open-ended stop
        # does not discard a caller-provided start.
        start = 0 if cslice[0] is None else cslice[0]
        stop = N if cslice[1] is None else cslice[1]

    def _optionalRows(key):
        # Rows start:stop of an optional shared array, or None if absent.
        if key not in dataShMemDict:
            return None
        return sharedMemToNumpyArray(dataShMemDict[key])[start:stop]

    return XData(X=X[start:stop],
                 Xprev=_optionalRows('Xprev'),
                 Y=_optionalRows('Y'),
                 n_examples=stop - start,
                 nObsTotal=nObsTotal,
                 do_copy=False)
Пример #3
0
    def run(self):
        """ Process jobs from JobQueue until a None sentinel is received.

        Each job is a tuple (sliceArgs, aArgs, oArgs), where sliceArgs is a
        (start, stop) pair. For every job this method:

        * refreshes aArgs / oArgs with the arrays from self.aShMem and
          self.oShMem (converted by sharedMemDictToNumpy),
        * builds the data slice for (start, stop) from self.dataShMem,
        * runs the local step and then the summary step,
        * puts the resulting SS on ResultQueue and calls
          JobQueue.task_done().

        Returns
        -------
        None
        """
        self.printMsg("process SetUp! pid=%d" % (os.getpid()))

        # Construct iterator with sentinel value of None (for termination)
        jobIterator = iter(self.JobQueue.get, None)
        for jobArgs in jobIterator:
            sliceArgs, aArgs, oArgs = jobArgs
            aArgs.update(sharedMemDictToNumpy(self.aShMem))
            oArgs.update(sharedMemDictToNumpy(self.oShMem))

            start, stop = sliceArgs
            Dslice = self.makeDataSliceFromSharedMem(
                self.dataShMem, (start, stop))

            # Diagnostic output: word counts for all data vs. this slice
            wc = sharedMemToNumpyArray(self.dataShMem['word_count'])
            self.printMsg("WCtotal=%d" % wc.sum())
            self.printMsg("WCslice=%d" % Dslice.word_count.sum())

            # Local step
            # Merge into a fresh dict: updating self.LPkwargs in place would
            # let one job's arguments persist into every later job.
            aLPkwargs = dict(self.LPkwargs)
            aLPkwargs.update(**aArgs)
            LP = self.o_calcLocalParams(Dslice, **oArgs)
            LP = self.a_calcLocalParams(Dslice, LP, **aLPkwargs)

            # Summary step
            SS = self.a_calcSummaryStats(Dslice, LP, **aArgs)
            SS = self.o_calcSummaryStats(Dslice, SS, LP, **oArgs)

            self.ResultQueue.put(SS)
            self.JobQueue.task_done()

        # Clean up
        self.printMsg("process CleanUp! pid=%d" % (os.getpid()))
Пример #4
0
    def fillSharedMemDictForLocalStep(self, ShMem=None):
        """ Get dict of shared mem arrays needed for parallel local step.

        Parameters
        ----------
        ShMem : dict, optional
            Existing dict to refresh. If None, a new dict is created.
            If it lacks key 'ElogphiT', new shared arrays are allocated for
            both 'ElogphiT' and 'Elog1mphiT'. Otherwise the first K columns
            of the existing shared arrays are overwritten in place.

        Returns
        -------
        ShMem : dict of RawArray objects
        """
        ElogphiT, Elog1mphiT = self.GetCached('E_logphiT_log1mphiT', 'all')
        K = self.K
        if ShMem is None:
            ShMem = dict()
        if 'ElogphiT' not in ShMem:
            ShMem['ElogphiT'] = numpyToSharedMemArray(ElogphiT)
            ShMem['Elog1mphiT'] = numpyToSharedMemArray(Elog1mphiT)
        else:
            updates = (('ElogphiT', ElogphiT), ('Elog1mphiT', Elog1mphiT))
            for key, curArr in updates:
                shView = sharedMemToNumpyArray(ShMem[key])
                # The write below fills columns [0, K), so the capacity
                # that matters is the column count, not the shape tuple.
                assert shView.shape[1] >= K
                shView[:, :K] = curArr
        return ShMem
Пример #5
0
    def fillSharedMemDictForLocalStep(self, ShMem=None):
        """ Publish current alpha_E_beta() values for parallel local step.

        Parameters
        ----------
        ShMem : dict, optional
            Existing dict of shared arrays to refresh. Any value that is
            not a dict (including None) is replaced by a new empty dict.

        Returns
        -------
        ShMem : dict of RawArray objects
            Holds key 'alphaEbeta'. A new shared array is created from a
            copy of the current values when the key is missing; otherwise
            the leading entries of the existing array are overwritten.
        """
        ShMem = ShMem if isinstance(ShMem, dict) else dict()
        curVals = self.alpha_E_beta()

        # First call for this dict: allocate the shared array and finish
        if 'alphaEbeta' not in ShMem:
            ShMem['alphaEbeta'] = numpyToSharedMemArray(curVals.copy())
            return ShMem

        # Later calls: write the new values into the existing shared array
        shView = sharedMemToNumpyArray(ShMem['alphaEbeta'])
        assert shView.size >= self.K
        shView[:curVals.size] = curVals
        return ShMem
Пример #6
0
def makeDataSliceFromSharedMem(dataShMemDict, cslice=(0, None), batchID=None):
    """ Create data slice from provided raw arrays and slice indicators.

    Parameters
    ----------
    dataShMemDict : dict
        Must provide entries 'doc_range' and 'X' (each readable by
        sharedMemToNumpyArray) and 'nDocTotal'. Entry 'Xprev' is optional.
        When batchID is given and is a key of this dict, the value stored
        under that key is used as the data dict instead.
    cslice : tuple (start, stop) or None
        Document indices bounding the slice; document `stop` is excluded.
        None in place of the tuple selects every document.
        A None start means 0; a None stop means doc_range.size - 1.
    batchID : hashable, optional
        Key selecting a per-batch sub-dict of dataShMemDict.

    Returns
    -------
    Dslice : namedtuple "GroupXDataTuple" with fields
        * X : rows of X belonging to the selected documents
        * Xprev : matching rows of Xprev, or None if not provided
        * doc_range : offsets of selected documents, shifted to start at 0
        * nDoc : stop - start
        * nObs : number of rows in the slice
        * dim : X.shape[1]
        * nDocTotal
    Represents subset of documents identified by cslice tuple.

    Example
    -------
    >>> Data = GroupXData(np.random.rand(25,2), doc_range=[0,4,12,25])
    >>> shMemDict = Data.getRawDataAsSharedMemDict()
    >>> Dslice = makeDataSliceFromSharedMem(shMemDict)
    >>> np.allclose(Data.X, Dslice.X)
    True
    >>> np.allclose(Data.nObs, Dslice.nObs)
    True
    >>> Data.dim == Dslice.dim
    True
    >>> Aslice = makeDataSliceFromSharedMem(shMemDict, (0, 2))
    >>> Aslice.nDoc
    2
    >>> np.allclose(Aslice.doc_range, Dslice.doc_range[0:(2+1)])
    True
    """
    if batchID is not None and batchID in dataShMemDict:
        dataShMemDict = dataShMemDict[batchID]

    # Make local views (NOT copies) to shared mem arrays
    doc_range = sharedMemToNumpyArray(dataShMemDict['doc_range'])
    X = sharedMemToNumpyArray(dataShMemDict['X'])
    nDocTotal = int(dataShMemDict['nDocTotal'])

    dim = X.shape[1]

    # Largest usable stop index: the last position of doc_range
    lastIndex = doc_range.size - 1
    if cslice is None:
        start, stop = 0, lastIndex
    else:
        # Resolve each bound on its own, so an open-ended stop
        # does not discard a caller-provided start.
        start = 0 if cslice[0] is None else cslice[0]
        stop = lastIndex if cslice[1] is None else cslice[1]

    # Row-level offsets that match the chosen document bounds
    tstart = doc_range[start]
    tstop = doc_range[stop]

    keys = ['X', 'Xprev', 'doc_range', 'nDoc', 'nObs', 'dim', 'nDocTotal']

    if 'Xprev' in dataShMemDict:
        Xprev = sharedMemToNumpyArray(dataShMemDict['Xprev'])[tstart:tstop]
    else:
        Xprev = None

    Dslice = namedtuple("GroupXDataTuple", keys)(
        X=X[tstart:tstop],
        Xprev=Xprev,
        # Shift offsets so the slice's first document starts at 0
        doc_range=doc_range[start:stop + 1] - doc_range[start],
        nDoc=stop - start,
        nObs=tstop - tstart,
        dim=dim,
        nDocTotal=nDocTotal,
    )
    return Dslice