def testSparseMatrix(self):
    """
    Check that SparseUtils.sparseMatrix builds the requested matrix type and
    storage layout for each (mattype, storagetype) combination, and that the
    result converts back to the dense matrix it was built from.
    """
    m = 10
    n = 15

    A = numpy.random.rand(m, n)
    rowInds, colInds = A.nonzero()
    vals = A[rowInds, colInds]

    # assertEqual/assertIs are used instead of assertTrue(a == b) so that a
    # failure reports both operands rather than just "False is not true".
    # assertIs(type(X), T) keeps the exact-type check of the original.

    # scipy, column storage
    X = SparseUtils.sparseMatrix(vals, rowInds, colInds, A.shape, "scipy", storagetype="col")
    self.assertEqual(X.dtype, A.dtype)
    self.assertEqual(X.shape, A.shape)
    self.assertIs(type(X), scipy.sparse.csc_matrix)
    nptst.assert_array_equal(X.toarray(), A)

    # scipy, row storage
    X = SparseUtils.sparseMatrix(vals, rowInds, colInds, A.shape, "scipy", storagetype="row")
    self.assertEqual(X.dtype, A.dtype)
    self.assertEqual(X.shape, A.shape)
    self.assertIs(type(X), scipy.sparse.csr_matrix)
    nptst.assert_array_equal(X.toarray(), A)

    # csarray, column storage
    X = SparseUtils.sparseMatrix(vals, rowInds, colInds, A.shape, "csarray", storagetype="col")
    self.assertEqual(X.dtype, A.dtype)
    self.assertEqual(X.shape, A.shape)
    self.assertIs(type(X), sppy.csarray)
    self.assertEqual(X.storagetype, "col")
    nptst.assert_array_equal(X.toarray(), A)

    # csarray, row storage
    X = SparseUtils.sparseMatrix(vals, rowInds, colInds, A.shape, "csarray", storagetype="row")
    self.assertEqual(X.dtype, A.dtype)
    self.assertEqual(X.shape, A.shape)
    self.assertIs(type(X), sppy.csarray)
    self.assertEqual(X.storagetype, "row")
    nptst.assert_array_equal(X.toarray(), A)
def testSparseMatrix(self):
    """
    Check that SparseUtils.sparseMatrix builds the requested matrix type and
    storage layout for each (mattype, storagetype) combination, and that the
    result converts back to the dense matrix it was built from.
    """
    m = 10
    n = 15

    A = numpy.random.rand(m, n)
    rowInds, colInds = A.nonzero()
    vals = A[rowInds, colInds]

    # assertEqual/assertIs are used instead of assertTrue(a == b) so that a
    # failure reports both operands rather than just "False is not true".
    # assertIs(type(X), T) keeps the exact-type check of the original.

    # scipy, column storage
    X = SparseUtils.sparseMatrix(vals, rowInds, colInds, A.shape, "scipy", storagetype="col")
    self.assertEqual(X.dtype, A.dtype)
    self.assertEqual(X.shape, A.shape)
    self.assertIs(type(X), scipy.sparse.csc_matrix)
    nptst.assert_array_equal(X.toarray(), A)

    # scipy, row storage
    X = SparseUtils.sparseMatrix(vals, rowInds, colInds, A.shape, "scipy", storagetype="row")
    self.assertEqual(X.dtype, A.dtype)
    self.assertEqual(X.shape, A.shape)
    self.assertIs(type(X), scipy.sparse.csr_matrix)
    nptst.assert_array_equal(X.toarray(), A)

    # csarray, column storage
    X = SparseUtils.sparseMatrix(vals, rowInds, colInds, A.shape, "csarray", storagetype="col")
    self.assertEqual(X.dtype, A.dtype)
    self.assertEqual(X.shape, A.shape)
    self.assertIs(type(X), sppy.csarray)
    self.assertEqual(X.storagetype, "col")
    nptst.assert_array_equal(X.toarray(), A)

    # csarray, row storage
    X = SparseUtils.sparseMatrix(vals, rowInds, colInds, A.shape, "csarray", storagetype="row")
    self.assertEqual(X.dtype, A.dtype)
    self.assertEqual(X.shape, A.shape)
    self.assertIs(type(X), sppy.csarray)
    self.assertEqual(X.storagetype, "row")
    nptst.assert_array_equal(X.toarray(), A)
def shuffleSplitRows(X, k, testSize, numRows=None, csarray=True, rowMajor=True, colProbs=None):
    """
    Take a sparse binary matrix and create k number of train-test splits
    in which the test split contains at most testSize elements and the train
    split contains the remaining elements from X for each row. The splits are
    computed randomly. Returns sppy.csarray objects by default.

    :param X: The sparse matrix to split; its shape, nnz and per-row nonzero
        column indices (via SparseUtils.getOmegaList) are used.

    :param k: The number of train-test splits to generate.

    :param testSize: The maximum number of elements per sampled row which are
        moved into the test split.

    :param numRows: The number of rows sampled (without replacement) to
        contribute test elements; the other rows go entirely into the train
        split. If None, all rows are used. Must not exceed the number of rows.

    :param csarray: If True build the outputs with the "csarray" matrix type,
        otherwise with "scipy".

    :param rowMajor: If True use "row" storage for the outputs, otherwise "col".

    :param colProbs: This is the probability of choosing the corresponding
        column/item. If None, we assume uniform probabilities. Any array-like
        of non-negative weights indexed by column is accepted; the weights of
        each row's nonzero columns are renormalised before sampling.

    :return: A list of k tuples (trainX, testX), or (trainX, testX, rowSample)
        when numRows was given, where rowSample is the sorted array of
        sampled row indices.
    """
    mattype = "csarray" if csarray else "scipy"
    storagetype = "row" if rowMajor else "col"

    # The sampled rows are only returned when the caller asked for a subset.
    outputRows = numRows is not None
    if numRows is None:
        numRows = X.shape[0]

    # "is not None" matters here: comparing an ndarray with == is elementwise
    # and cannot be used as a condition.
    if colProbs is not None:
        colProbs = numpy.asarray(colProbs, dtype=float)

    trainTestXList = []
    # omegaList[j] is used below as the array of column indices of the
    # nonzero entries of row j (presumably what getOmegaList returns).
    omegaList = SparseUtils.getOmegaList(X)
    m = X.shape[0]

    for i in range(k):
        trainInd = 0
        testInd = 0

        # Preallocate for the worst case and trim afterwards.
        trainRowInds = numpy.zeros(X.nnz, numpy.int32)
        trainColInds = numpy.zeros(X.nnz, numpy.int32)
        testRowInds = numpy.zeros(m * testSize, numpy.int32)
        testColInds = numpy.zeros(m * testSize, numpy.int32)

        rowSample = numpy.sort(numpy.random.choice(m, numRows, replace=False))
        # Boolean mask gives O(1) membership instead of scanning rowSample
        # once per row.
        isSampled = numpy.zeros(m, dtype=bool)
        isSampled[rowSample] = True

        for j in range(m):
            rowCols = omegaList[j]
            numCols = rowCols.shape[0]

            if isSampled[j]:
                if colProbs is None:
                    inds = numpy.random.permutation(numCols)
                elif numCols == 0:
                    # Nothing to sample; choice() would reject an empty
                    # probability vector.
                    inds = numpy.array([], int)
                else:
                    probs = colProbs[rowCols]
                    probs = probs / probs.sum()
                    inds = numpy.random.choice(numCols, numCols, p=probs, replace=False)

                # The first testSize positions of the random ordering go to
                # the test split, the rest to the train split.
                trainCols = rowCols[inds[testSize:]]
                testCols = rowCols[inds[0:testSize]]
            else:
                trainCols = rowCols
                testCols = rowCols[0:0]

            numTrain = trainCols.shape[0]
            trainRowInds[trainInd:trainInd + numTrain] = j
            trainColInds[trainInd:trainInd + numTrain] = trainCols
            trainInd += numTrain

            numTest = testCols.shape[0]
            testRowInds[testInd:testInd + numTest] = j
            testColInds[testInd:testInd + numTest] = testCols
            testInd += numTest

        trainRowInds = trainRowInds[0:trainInd]
        trainColInds = trainColInds[0:trainInd]
        testRowInds = testRowInds[0:testInd]
        testColInds = testColInds[0:testInd]

        # Builtin int is what the removed numpy.int alias referred to.
        trainX = SparseUtils.sparseMatrix(numpy.ones(trainInd, int), trainRowInds, trainColInds, X.shape, mattype, storagetype)
        testX = SparseUtils.sparseMatrix(numpy.ones(testInd, int), testRowInds, testColInds, X.shape, mattype, storagetype)

        if not outputRows:
            trainTestXList.append((trainX, testX))
        else:
            trainTestXList.append((trainX, testX, rowSample))

    return trainTestXList
def shuffleSplitRows(X, k, testSize, numRows=None, csarray=True, rowMajor=True, colProbs=None):
    """
    Take a sparse binary matrix and create k number of train-test splits
    in which the test split contains at most testSize elements and the train
    split contains the remaining elements from X for each row. The splits are
    computed randomly. Returns sppy.csarray objects by default.

    :param X: The sparse matrix to split; its shape, nnz and per-row nonzero
        column indices (via SparseUtils.getOmegaList) are used.

    :param k: The number of train-test splits to generate.

    :param testSize: The maximum number of elements per sampled row which are
        moved into the test split.

    :param numRows: The number of rows sampled (without replacement) to
        contribute test elements; the other rows go entirely into the train
        split. If None, all rows are used. Must not exceed the number of rows.

    :param csarray: If True build the outputs with the "csarray" matrix type,
        otherwise with "scipy".

    :param rowMajor: If True use "row" storage for the outputs, otherwise "col".

    :param colProbs: This is the probability of choosing the corresponding
        column/item. If None, we assume uniform probabilities. Any array-like
        of non-negative weights indexed by column is accepted; the weights of
        each row's nonzero columns are renormalised before sampling.

    :return: A list of k tuples (trainX, testX), or (trainX, testX, rowSample)
        when numRows was given, where rowSample is the sorted array of
        sampled row indices.
    """
    mattype = "csarray" if csarray else "scipy"
    storagetype = "row" if rowMajor else "col"

    # The sampled rows are only returned when the caller asked for a subset.
    outputRows = numRows is not None
    if numRows is None:
        numRows = X.shape[0]

    # "is not None" matters here: comparing an ndarray with == is elementwise
    # and cannot be used as a condition.
    if colProbs is not None:
        colProbs = numpy.asarray(colProbs, dtype=float)

    trainTestXList = []
    # omegaList[j] is used below as the array of column indices of the
    # nonzero entries of row j (presumably what getOmegaList returns).
    omegaList = SparseUtils.getOmegaList(X)
    m = X.shape[0]

    for i in range(k):
        trainInd = 0
        testInd = 0

        # Preallocate for the worst case and trim afterwards.
        trainRowInds = numpy.zeros(X.nnz, numpy.int32)
        trainColInds = numpy.zeros(X.nnz, numpy.int32)
        testRowInds = numpy.zeros(m * testSize, numpy.int32)
        testColInds = numpy.zeros(m * testSize, numpy.int32)

        rowSample = numpy.sort(numpy.random.choice(m, numRows, replace=False))
        # Boolean mask gives O(1) membership instead of scanning rowSample
        # once per row.
        isSampled = numpy.zeros(m, dtype=bool)
        isSampled[rowSample] = True

        for j in range(m):
            rowCols = omegaList[j]
            numCols = rowCols.shape[0]

            if isSampled[j]:
                if colProbs is None:
                    inds = numpy.random.permutation(numCols)
                elif numCols == 0:
                    # Nothing to sample; choice() would reject an empty
                    # probability vector.
                    inds = numpy.array([], int)
                else:
                    probs = colProbs[rowCols]
                    probs = probs / probs.sum()
                    inds = numpy.random.choice(numCols, numCols, p=probs, replace=False)

                # The first testSize positions of the random ordering go to
                # the test split, the rest to the train split.
                trainCols = rowCols[inds[testSize:]]
                testCols = rowCols[inds[0:testSize]]
            else:
                trainCols = rowCols
                testCols = rowCols[0:0]

            numTrain = trainCols.shape[0]
            trainRowInds[trainInd:trainInd + numTrain] = j
            trainColInds[trainInd:trainInd + numTrain] = trainCols
            trainInd += numTrain

            numTest = testCols.shape[0]
            testRowInds[testInd:testInd + numTest] = j
            testColInds[testInd:testInd + numTest] = testCols
            testInd += numTest

        trainRowInds = trainRowInds[0:trainInd]
        trainColInds = trainColInds[0:trainInd]
        testRowInds = testRowInds[0:testInd]
        testColInds = testColInds[0:testInd]

        # Builtin int is what the removed numpy.int alias referred to.
        trainX = SparseUtils.sparseMatrix(numpy.ones(trainInd, int), trainRowInds, trainColInds, X.shape, mattype, storagetype)
        testX = SparseUtils.sparseMatrix(numpy.ones(testInd, int), testRowInds, testColInds, X.shape, mattype, storagetype)

        if not outputRows:
            trainTestXList.append((trainX, testX))
        else:
            trainTestXList.append((trainX, testX, rowSample))

    return trainTestXList