Example #1
0
    def _initialise(self, corpus):
        """Initialise Gibbs-sampling count arrays from the document-term matrix.

        Sets up the four count arrays (topic-word, topic totals,
        document-topic, document totals) and assigns a uniformly random
        topic to every token occurrence.

        Parameters
        ----------
        corpus : 2-D array, shape (D, V)
            Document-term count matrix (D documents, V vocabulary terms).
        """
        self.D, self.V = D, V = corpus.shape
        n_t = self.num_topic

        # Count arrays: topic-word, topic totals, document-topic, document totals.
        self._n_zw = np.zeros((n_t, V), dtype=np.intc)
        self._n_z = np.zeros((n_t), dtype=np.intc)
        self._n_dz = np.zeros((D, n_t), dtype=np.intc)
        self._n_d = np.zeros((D), dtype=np.intc)

        # Total token count and flattened (doc, word) occurrence lists.
        self._n = n = int(corpus.sum())
        self.d_list, self.w_list = util.array2list(corpus)
        self.z_list = []
        self.log_likelihood = np.zeros(self.num_iter, dtype=float)

        for i in range(n):
            d = self.d_list[i]
            w = self.w_list[i]
            # Assign an initial topic uniformly at random.
            z = np.random.randint(0, n_t, dtype=np.intc)
            self.z_list.append(z)

            # Tuple indexing `[z, w]` is the idiomatic (single-lookup) form
            # of the chained `[z][w]` indexing on ndarrays.
            self._n_zw[z, w] += 1  # increment topic-word count
            self._n_z[z] += 1      # increment topic-word sum
            self._n_dz[d, z] += 1  # increment doc-topic count
            self._n_d[d] += 1      # increment doc-topic sum
Example #2
0
    def compress(self):
        """Flush the buffered rows into the Gaussian projection and/or the
        running QR factorization, then clear the row buffer.

        No-op while the column count is unknown or no rows are buffered.
        """
        if self.ncols is None or len(self.data) == 0:
            return

        if self.compute_GP:
            t0 = time.time()
            # Random Gaussian projection of the buffered rows.
            G = np.random.randn(self.projsize, len(self.data)) / 100.
            A_flush = G * np.mat(self.data)
            dt = time.time() - t0
            self.counters['numpy time (millisecs)'] += int(1000 * dt)

            # Add flushed update to local copy.  Must use `is None`:
            # `== None` on a numpy matrix compares elementwise and raises
            # when the result is truth-tested.
            if self.A_curr is None:
                self.A_curr = A_flush
            else:
                self.A_curr += A_flush

        if self.compute_QR:
            t0 = time.time()
            R = self.QR()
            dt = time.time() - t0
            self.counters['numpy time (millisecs)'] += int(1000 * dt)
            # Reset QR state and re-initialize it to the rows of R.
            self.qr_data = [util.array2list(row) for row in R]

        self.data = []
Example #3
0
    def compress(self):
        """Factor the rows accumulated so far via QR; the rows of R become
        the new, compressed data buffer."""
        start = time.time()
        R = self.QR()
        elapsed = time.time() - start
        self.counters['numpy time (millisecs)'] += int(1000 * elapsed)

        # Drop the accumulated rows and seed the buffer from R.
        self.A_data = []
        self.data = [util.array2list(row) for row in R]
Example #4
0
    def compress(self):
        """Compute a QR factorization on the data accumulated so far, once
        the buffer holds at least `ncols` rows; the rows of R become the
        new, compressed buffer.

        No-op while `ncols` is unknown or the buffer is still short.
        """
        # `is None` rather than `== None` (PEP 8 singleton comparison).
        if self.ncols is None or len(self.data) < self.ncols:
            return

        t0 = time.time()
        R = self.QR()
        dt = time.time() - t0
        # Record the factorization time; `dt` was previously computed but
        # never used -- recording it matches the sibling compress()
        # implementations in this project.
        self.counters['numpy time (millisecs)'] += int(1000 * dt)

        # reset data and re-initialize to R
        self.data = []
        for row in R:
            self.data.append(util.array2list(row))
Example #5
0
    def compress(self):
        """Run a QR factorization over the buffered rows and keep only R."""
        # Nothing to do until the buffer holds at least `ncols` rows.
        if self.ncols is None or len(self.data) < self.ncols:
            return

        start = time.time()
        R = self.QR()
        elapsed = time.time() - start
        self.counters['numpy time (millisecs)'] += int(1000 * elapsed)

        # The rows of R replace the accumulated data.
        self.data = [util.array2list(row) for row in R]
Example #6
0
    def close(self):
        """Final flush at end of input: emit projection rows, QR rows, and
        column norms as (key, value) pairs.

        Yields
        ------
        ('GP', row_index), row-list       for the Gaussian projection
        ('QR', random_key), row-list      for the QR factor rows
        ('colnorms', col_index), value    for the column norms
        """
        # Account for the final partial batch of rows (presumably the
        # counter is bumped every 50000 rows elsewhere -- TODO confirm).
        self.counters['rows processed'] += self.nrows % 50000
        self.compress()

        if self.compute_GP:
            # `is not None`: `!= None` on a numpy matrix compares
            # elementwise and raises when truth-tested.
            if self.A_curr is not None:
                for ind, row in enumerate(self.A_curr.getA()):
                    yield ('GP', ind), util.array2list(row)

        if self.compute_QR:
            for i, row in enumerate(self.qr_data):
                # Random keys spread the QR rows across downstream workers.
                key = np.random.randint(0, 4000000000)
                yield ('QR', key), row

        if self.compute_colnorms and self.colnorms is not None:
            for ind, val in enumerate(self.colnorms):
                yield ('colnorms', ind), val
Example #7
0
    def output(self, final=False):
        """Yield (key, row) pairs of U = compute_U(A) over the buffered
        block, once the buffer is full (or unconditionally when `final`).

        Parameters
        ----------
        final : bool
            When True, flush whatever is buffered regardless of size.
        """
        # Guard before the size test: `self.blocksize * self.ncols` raises
        # TypeError while ncols is still None.  This also avoids counting a
        # "block output" when nothing can actually be output.
        if self.ncols is None:
            return

        if final or len(self.data) >= self.blocksize * self.ncols:
            self.counters['Blocks Output'] += 1

            # Compress the buffered block.
            t0 = time.time()
            A = numpy.array(self.data)
            U = self.compute_U(A)
            dt = time.time() - t0
            self.counters['numpy time (millisecs)'] += int(1000 * dt)

            # One output row per buffered key.
            assert (U.shape[0] == len(self.keys))

            for i, row in enumerate(U):
                yield self.keys[i], util.array2list(row)

            self.data = []
            self.keys = []
Example #8
0
 def output(self, final=False):
     """Yield (key, row) pairs of U = compute_U(A) over the buffered
     block, once the buffer is full (or unconditionally when `final`).

     Parameters
     ----------
     final : bool
         When True, flush whatever is buffered regardless of size.
     """
     # Guard first: `self.blocksize * self.ncols` raises TypeError while
     # ncols is still None, and nothing can be output in that state.
     if self.ncols is None:
         return

     if final or len(self.data) >= self.blocksize * self.ncols:
         self.counters['Blocks Output'] += 1

         # Compress the buffered block.
         t0 = time.time()
         A = numpy.array(self.data)
         U = self.compute_U(A)
         dt = time.time() - t0
         self.counters['numpy time (millisecs)'] += int(1000 * dt)

         # One output row per buffered key.
         assert (U.shape[0] == len(self.keys))

         for i, row in enumerate(U):
             yield self.keys[i], util.array2list(row)

         self.data = []
         self.keys = []
Example #9
0
 def close(self):
     """Flush remaining rows, then yield (row_index, row-as-list) pairs
     for the final accumulated matrix, if any."""
     self.counters['rows processed'] += self.nrows % 50000
     self.compress()
     if self.A_curr is None:
         return
     for idx, vec in enumerate(self.A_curr.getA()):
         yield idx, util.array2list(vec)
Example #10
0
 def close(self):
     """Emit the final accumulated matrix after one last compress.

     Adds the leftover row count to the processed-row counter, then
     yields (index, row-as-list) pairs when a matrix was accumulated.
     """
     leftover = self.nrows % 50000
     self.counters['rows processed'] += leftover
     self.compress()
     if self.A_curr is not None:
         final_rows = self.A_curr.getA()
         for position, entry in enumerate(final_rows):
             yield position, util.array2list(entry)