def centerRows(X, mu=None, inds=None):
    """
    Subtract a per-row value (by default the row mean) from each non-zero
    element of X. X is modified in place and also returned.

    :param X: matrix supporting nonzero(), sum(1), shape, dtype and
        fancy-index get/set (presumably a scipy.sparse matrix -- confirm
        against callers).
    :param mu: per-row values to subtract, indexed by row of X. If None, it is
        computed as the row sums divided by the number of indexed entries in
        each row, with 0 for rows that have no indexed entries.
    :param inds: optional (rowInds, colInds) pair of the entries to use. If
        None, X.nonzero() is used.
    :return: the tuple (X, mu).
    """
    # Use identity tests: "mu == None" on a numpy array is an elementwise
    # comparison whose truth value is ambiguous, so passing mu would raise.
    if inds is None:
        rowInds, colInds = X.nonzero()
    else:
        rowInds, colInds = inds

    rowInds = numpy.array(rowInds, numpy.int32)
    colInds = numpy.array(colInds, numpy.int32)

    if mu is None:
        # This is the mean of the nonzero values in each row
        nonZeroCounts = numpy.bincount(rowInds, minlength=X.shape[0])
        emptyRows = nonZeroCounts == 0
        # Bump empty rows to a count of 1 so the division below is safe;
        # their mean is reset to 0 afterwards.
        nonZeroCounts += emptyRows

        # This is required because when we do X.sum(1) for centering it uses
        # the same dtype as X to store the sum, and this can result in
        # overflow for e.g. uint8
        if X.dtype == numpy.uint8:
            sumCol = SparseUtilsCython.sumCols(
                rowInds, numpy.array(X[rowInds, colInds]).flatten(), X.shape[0])
        else:
            sumCol = numpy.array(X.sum(1)).flatten()

        mu = sumCol / nonZeroCounts
        mu[emptyRows] = 0

    # numpy.float was only an alias of the builtin float and no longer exists
    # in recent NumPy releases, so the builtin is used directly.
    # Presumably vals[k] == mu[rowInds[k]] * 1 for each indexed entry -- confirm
    # against SparseUtilsCython.partialOuterProduct.
    vals = SparseUtilsCython.partialOuterProduct(
        rowInds, colInds, numpy.array(mu, float), numpy.ones(X.shape[1]))

    # NOTE(review): vals follows the order of (rowInds, colInds) while the
    # update is indexed by X.nonzero(); these agree when inds is None, but a
    # caller-supplied inds is assumed to match X.nonzero() -- confirm.
    nonzeroInds = X.nonzero()
    X[nonzeroInds] = numpy.array(X[nonzeroInds] - vals, float)

    return X, mu
def centerRows(X, mu=None, inds=None):
    """
    Subtract a per-row value (by default the row mean) from each non-zero
    element of X. X is modified in place and also returned.

    :param X: matrix supporting nonzero(), sum(1), shape, dtype and
        fancy-index get/set (presumably a scipy.sparse matrix -- confirm
        against callers).
    :param mu: per-row values to subtract, indexed by row of X. If None, it is
        computed as the row sums divided by the number of indexed entries in
        each row, with 0 for rows that have no indexed entries.
    :param inds: optional (rowInds, colInds) pair of the entries to use. If
        None, X.nonzero() is used.
    :return: the tuple (X, mu).
    """
    # Identity tests are required here: comparing a numpy array to None with
    # "==" is elementwise and cannot be used as an "if" condition.
    if inds is None:
        rowInds, colInds = X.nonzero()
    else:
        rowInds, colInds = inds

    rowInds = numpy.array(rowInds, numpy.int32)
    colInds = numpy.array(colInds, numpy.int32)

    if mu is None:
        # This is the mean of the nonzero values in each row
        nonZeroCounts = numpy.bincount(rowInds, minlength=X.shape[0])
        emptyRows = nonZeroCounts == 0
        # Rows without entries get a count of 1 to avoid dividing by zero;
        # their mean is forced back to 0 after the division.
        nonZeroCounts += emptyRows

        # This is required because when we do X.sum(1) for centering it uses
        # the same dtype as X to store the sum, and this can result in
        # overflow for e.g. uint8
        if X.dtype == numpy.uint8:
            sumCol = SparseUtilsCython.sumCols(
                rowInds, numpy.array(X[rowInds, colInds]).flatten(), X.shape[0])
        else:
            sumCol = numpy.array(X.sum(1)).flatten()

        mu = sumCol / nonZeroCounts
        mu[emptyRows] = 0

    # The builtin float replaces the numpy.float alias, which recent NumPy
    # releases no longer provide.
    # Presumably vals[k] == mu[rowInds[k]] * 1 for each indexed entry -- confirm
    # against SparseUtilsCython.partialOuterProduct.
    vals = SparseUtilsCython.partialOuterProduct(
        rowInds, colInds, numpy.array(mu, float), numpy.ones(X.shape[1]))

    # NOTE(review): vals is ordered like (rowInds, colInds) whereas the update
    # is indexed by X.nonzero(); identical when inds is None, otherwise the
    # caller-supplied inds is assumed to match X.nonzero() -- confirm.
    nonzeroInds = X.nonzero()
    X[nonzeroInds] = numpy.array(X[nonzeroInds] - vals, float)

    return X, mu
def testSumCols(self):
    """
    Check that SparseUtilsCython.sumCols, fed the non-zero entries of a random
    uint8 sparse matrix, returns the same per-row totals as A.sum(1).
    """
    numRows, numCols, density = 10, 15, 0.5

    # Random values scaled by 10 and then stored as uint8 in CSC format
    randomMatrix = scipy.sparse.rand(numRows, numCols, density) * 10
    A = scipy.sparse.csc_matrix(randomMatrix, dtype=numpy.uint8)

    # Index arrays are converted to int32 before being handed to sumCols
    rowInds, colInds = (
        numpy.array(indexArray, numpy.int32) for indexArray in A.nonzero())

    nonzeroValues = numpy.array(A[rowInds, colInds]).flatten()
    actualSums = SparseUtilsCython.sumCols(rowInds, nonzeroValues, A.shape[0])

    expectedSums = numpy.array(A.sum(1)).flatten()
    nptst.assert_array_equal(expectedSums, actualSums)