def testSampleUsers(self):
    """sampleUsers returns the whole matrix when k >= m, else a consistent row subsample."""
    m = 10
    n = 15
    r = 5
    u = 0.3
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, w, csarray=True, verbose=True, indsPerRow=200)

    # k larger than the number of users: the full matrix comes back unchanged
    k = 50
    X2, userInds = Sampling.sampleUsers(X, k)
    nptst.assert_array_equal(X.toarray(), X2.toarray())

    numRuns = 50
    for i in range(numRuns):
        m = numpy.random.randint(10, 100)
        n = numpy.random.randint(10, 100)
        k = numpy.random.randint(10, 100)
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, w, csarray=True, verbose=True, indsPerRow=200)

        X2, userInds = Sampling.sampleUsers(X, k)
        # assertEquals is a deprecated alias (removed in Python 3.12)
        self.assertEqual(X2.shape[0], min(k, m))
        # Fixed misplaced parenthesis: the original applied .all() to the zero
        # matrix rather than to the comparison; assert the Gram matrix is not all zero.
        self.assertTrue((X.dot(X.T) != numpy.zeros((m, m))).any())
        self.assertTrue((X2.toarray() == X.toarray()[userInds, :]).all())
        self.assertEqual(X.toarray()[userInds, :].nonzero()[0].shape[0], X2.nnz)
def testLocalAUC(self):
    """localAUC with u=0 must equal the per-row mean sklearn ROC AUC; w=1 gives 0."""
    m = 10
    n = 20
    k = 2
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, 0.5, verbose=True, csarray=True)

    Z = U.dot(V.T)

    # Reference value: average of per-row ROC AUC scores
    localAuc = numpy.zeros(m)
    for i in range(m):
        localAuc[i] = sklearn.metrics.roc_auc_score(numpy.ravel(X[i, :].toarray()), Z[i, :])
    localAuc = localAuc.mean()

    u = 0.0
    localAuc2 = MCEvaluator.localAUC(X, U, V, u)
    # assertEquals is a deprecated alias (removed in Python 3.12)
    self.assertEqual(localAuc, localAuc2)

    # Now try a large w: every score falls below the quantile so the AUC is zero
    w = 1.0
    localAuc2 = MCEvaluator.localAUC(X, U, V, w)
    self.assertEqual(localAuc2, 0)
def testAverageRocCurve(self):
    """averageRocCurve should run both with and without a training set."""
    m = 50
    n = 20
    k = 8
    u = 20.0 / m
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True, verbose=True, indsPerRow=200)

    fpr, tpr = MCEvaluator.averageRocCurve(X, U, V)

    import matplotlib
    # Fixed: GTK3Agg needs a display and GTK bindings, which breaks headless
    # test machines; nothing is rendered here (plot calls are commented out),
    # so the non-interactive Agg backend is the right choice.
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    # plt.plot(fpr, tpr)
    # plt.show()

    # Now try case where we have a training set
    folds = 1
    testSize = 5
    trainTestXs = Sampling.shuffleSplitRows(X, folds, testSize)
    trainX, testX = trainTestXs[0]

    fpr, tpr = MCEvaluator.averageRocCurve(testX, U, V, trainX=trainX)
def testLocalAUC(self):
    """Check localAUC against sklearn's ROC AUC averaged over rows (u=0), and w=1 -> 0."""
    m = 10
    n = 20
    k = 2
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, 0.5, verbose=True, csarray=True)

    Z = U.dot(V.T)

    localAuc = numpy.zeros(m)
    for i in range(m):
        localAuc[i] = sklearn.metrics.roc_auc_score(numpy.ravel(X[i, :].toarray()), Z[i, :])
    localAuc = localAuc.mean()

    u = 0.0
    localAuc2 = MCEvaluator.localAUC(X, U, V, u)
    # assertEquals is a deprecated alias (removed in Python 3.12)
    self.assertEqual(localAuc, localAuc2)

    # Now try a large w: all scores fall below the quantile so the AUC is zero
    w = 1.0
    localAuc2 = MCEvaluator.localAUC(X, U, V, w)
    self.assertEqual(localAuc2, 0)
def testOverfit(self):
    """
    See if we can get a zero objective on the hinge loss
    """
    m = 10
    n = 20
    k = 5
    u = 0.5
    w = 1 - u
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True)

    eps = 0.001
    k = 10
    maxLocalAuc = MaxLocalAUC(k, u, eps=eps, stochastic=True)
    maxLocalAuc.rate = "constant"
    maxLocalAuc.maxIterations = 500
    maxLocalAuc.numProcesses = 1
    maxLocalAuc.loss = "hinge"
    maxLocalAuc.validationUsers = 0
    maxLocalAuc.lmbda = 0

    print("Overfit example")
    # Renamed local 'time' -> 'elapsed' to avoid shadowing the stdlib time module
    U, V, trainMeasures, testMeasures, iterations, elapsed = maxLocalAuc.learnModel(X, verbose=True)

    # assertAlmostEquals is a deprecated alias (removed in Python 3.12)
    self.assertAlmostEqual(trainMeasures[-1, 0], 0, 3)
def testParallelLearnModel(self):
    """Smoke test for parallelLearnModel on a small synthetic dataset."""
    numpy.random.seed(21)
    m, n = 500, 200
    k = 5
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, csarray=True)

    from wallhack.rankingexp.DatasetUtils import DatasetUtils
    # The synthetic dataset replaces the matrix generated above
    X, U, V = DatasetUtils.syntheticDataset1()

    u = 0.1
    w = 1 - u
    eps = 0.05
    learner = MaxLocalAUC(k, w, alpha=1.0, eps=eps, stochastic=True)
    learner.maxIterations = 3
    learner.recordStep = 1
    learner.rate = "optimal"
    learner.t0 = 2.0
    learner.validationUsers = 0.0
    learner.numProcesses = 4

    os.system('taskset -p 0xffffffff %d' % os.getpid())
    print(X.nnz / learner.numAucSamples)

    U, V = learner.parallelLearnModel(X)
def testAverageRocCurve(self):
    """averageRocCurve should work on the full matrix and on a train/test split."""
    m = 50
    n = 20
    k = 8
    u = 20.0 / m
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True, verbose=True, indsPerRow=200)

    fpr, tpr = MCEvaluator.averageRocCurve(X, U, V)

    import matplotlib
    # Fixed: GTK3Agg requires a display/GTK bindings and fails headless; the
    # plotting calls below are commented out, so use the non-interactive Agg backend.
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    #plt.plot(fpr, tpr)
    #plt.show()

    #Now try case where we have a training set
    folds = 1
    testSize = 5
    trainTestXs = Sampling.shuffleSplitRows(X, folds, testSize)
    trainX, testX = trainTestXs[0]

    fpr, tpr = MCEvaluator.averageRocCurve(testX, U, V, trainX=trainX)
def testOverfit(self):
    """
    See if we can get a zero objective on the hinge loss
    """
    m = 10
    n = 20
    k = 5
    u = 0.5
    w = 1 - u
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True)

    eps = 0.001
    k = 10
    maxLocalAuc = MaxLocalAUC(k, u, eps=eps, stochastic=True)
    maxLocalAuc.rate = "constant"
    maxLocalAuc.maxIterations = 500
    maxLocalAuc.numProcesses = 1
    maxLocalAuc.loss = "hinge"
    maxLocalAuc.validationUsers = 0
    maxLocalAuc.lmbda = 0

    print("Overfit example")
    # 'elapsed' replaces the original 'time', which shadowed the stdlib module
    U, V, trainMeasures, testMeasures, iterations, elapsed = maxLocalAuc.learnModel(X, verbose=True)

    # assertAlmostEquals is a deprecated alias (removed in Python 3.12)
    self.assertAlmostEqual(trainMeasures[-1, 0], 0, 3)
def testModelSelect(self):
    """Run BprRecommender model selection over a small parameter grid."""
    m = n = 50
    k = 5
    u = 0.5
    w = 1 - u
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, w)

    # Reset CPU affinity in case a library restricted it
    os.system('taskset -p 0xffffffff %d' % os.getpid())

    u = 0.2
    lmbda = 0.1
    gamma = 0.01
    recommender = BprRecommender(k, lmbda, gamma)
    recommender.maxIterations = 2
    recommender.ks = 2 ** numpy.arange(3, 5)
    recommender.lmbdaUsers = 2.0 ** -numpy.arange(1, 3)
    recommender.lmbdaPoses = 2.0 ** -numpy.arange(1, 3)
    recommender.lmbdaNegs = 2.0 ** -numpy.arange(1, 3)
    recommender.gammas = 2.0 ** -numpy.arange(1, 3)
    recommender.folds = 2
    recommender.numProcesses = 1

    # Per-row probabilities proportional to the number of nonzeros
    colProbs = numpy.array(X.sum(1)).ravel()
    colProbs /= colProbs.sum()
    print(colProbs, colProbs.shape)

    recommender.modelSelect(X, colProbs=colProbs)
def testModelSelect(self):
    """Exercise BprRecommender.modelSelect with a tiny hyper-parameter grid."""
    m = n = 50
    k = 5
    u = 0.5
    w = 1 - u
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, w)

    os.system('taskset -p 0xffffffff %d' % os.getpid())

    u = 0.2
    lmbda = 0.1
    gamma = 0.01
    bpr = BprRecommender(k, lmbda, gamma)
    bpr.maxIterations = 2
    bpr.ks = 2 ** numpy.arange(3, 5)
    bpr.lmbdaUsers = 2.0 ** -numpy.arange(1, 3)
    bpr.lmbdaPoses = 2.0 ** -numpy.arange(1, 3)
    bpr.lmbdaNegs = 2.0 ** -numpy.arange(1, 3)
    bpr.gammas = 2.0 ** -numpy.arange(1, 3)
    bpr.folds = 2
    bpr.numProcesses = 1

    # Row-sum based sampling probabilities
    colProbs = numpy.array(X.sum(1)).ravel()
    colProbs /= colProbs.sum()
    print(colProbs, colProbs.shape)

    bpr.modelSelect(X, colProbs=colProbs)
def testLocalAucApprox(self):
    """Approximate local AUC should approach the exact value as samples grow."""
    m, n = 100, 200
    k = 2
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, csarray=True, verbose=True)

    w = 1.0
    exactAuc = MCEvaluator.localAUC(X, U, V, w)

    for numAucSamples in numpy.arange(150, 200, 10):
        approxAuc = MCEvaluator.localAUCApprox(SparseUtils.getOmegaListPtr(X), U, V, w, numAucSamples)
        self.assertAlmostEqual(approxAuc, exactAuc, 1)

    # Try smaller w
    w = 0.5
    exactAuc = MCEvaluator.localAUC(X, U, V, w)

    for numAucSamples in numpy.arange(50, 200, 10):
        approxAuc = MCEvaluator.localAUCApprox(SparseUtils.getOmegaListPtr(X), U, V, w, numAucSamples)
        self.assertAlmostEqual(approxAuc, exactAuc, 1)
def testLocalAucApprox(self):
    """The sampled local AUC estimate should track the exact local AUC."""
    m, n = 100, 200
    k = 2
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, csarray=True, verbose=True)

    w = 1.0
    reference = MCEvaluator.localAUC(X, U, V, w)

    for sampleCount in numpy.arange(150, 200, 10):
        estimate = MCEvaluator.localAUCApprox(SparseUtils.getOmegaListPtr(X), U, V, w, sampleCount)
        self.assertAlmostEqual(estimate, reference, 1)

    # Repeat with a smaller quantile weight
    w = 0.5
    reference = MCEvaluator.localAUC(X, U, V, w)

    for sampleCount in numpy.arange(50, 200, 10):
        estimate = MCEvaluator.localAUCApprox(SparseUtils.getOmegaListPtr(X), U, V, w, sampleCount)
        self.assertAlmostEqual(estimate, reference, 1)
def testF1Atk(self):
    """f1AtK should equal the harmonic mean of precision@k and recall@k."""
    m = 10
    n = 5
    r = 3
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, 0.5, verbose=True)

    import sppy
    X = sppy.csarray(X)

    orderedItems = MCEvaluator.recommendAtk(U * s, V, n)
    # With k=n precision is r/n and recall is 1, so F1 = 2(r/n)/(1 + r/n)
    # assertAlmostEquals is a deprecated alias (removed in Python 3.12)
    self.assertAlmostEqual(MCEvaluator.f1AtK(X, orderedItems, n, verbose=False), 2 * r / float(n) / (1 + r / float(n)))

    m = 20
    n = 50
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, 0.5, verbose=True)

    k = 5
    orderedItems = MCEvaluator.recommendAtk(U * s, V, k)
    precision, scoreInds = MCEvaluator.precisionAtK(X, orderedItems, k, verbose=True)
    recall, scoreInds = MCEvaluator.recallAtK(X, orderedItems, k, verbose=True)

    f1s = numpy.zeros(m)
    for i in range(m):
        f1s[i] = 2 * precision[i] * recall[i] / (precision[i] + recall[i])

    orderedItems = MCEvaluator.recommendAtk(U * s, V, n)
    f1s2, scoreInds = MCEvaluator.f1AtK(X, orderedItems, k, verbose=True)

    nptst.assert_array_equal(f1s, f1s2)

    # Test case where we get a zero precision or recall
    orderedItems[5, :] = -1
    precision, scoreInds = MCEvaluator.precisionAtK(X, orderedItems, k, verbose=True)
    recall, scoreInds = MCEvaluator.recallAtK(X, orderedItems, k, verbose=True)

    f1s = numpy.zeros(m)
    for i in range(m):
        # Guard against division by zero when both precision and recall are 0
        if precision[i] + recall[i] != 0:
            f1s[i] = 2 * precision[i] * recall[i] / (precision[i] + recall[i])

    f1s2, scoreInds = MCEvaluator.f1AtK(X, orderedItems, k, verbose=True)

    nptst.assert_array_equal(f1s, f1s2)
def __init__(self):
    """Seed the RNG and build a large low-rank binary matrix for profiling."""
    numpy.random.seed(21)

    # Create a low rank matrix
    self.m = 1000
    self.n = 5000
    self.k = 10
    self.X = SparseUtils.generateSparseBinaryMatrix((self.m, self.n), self.k, csarray=True)
def __init__(self):
    """Seed the RNG, create a small low-rank dataset and print it."""
    numpy.random.seed(21)

    # Create a low rank matrix
    m, n = 500, 200
    self.k = 8
    self.X = SparseUtils.generateSparseBinaryMatrix((m, n), self.k, csarray=True)
    print(self.X)
def testSampleUsers2(self):
    """sampleUsers2 returns the full matrix for large k, prunes empty columns, and subsamples rows consistently."""
    m = 10
    n = 15
    r = 5
    u = 0.3
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, w, csarray=True, verbose=True, indsPerRow=200)

    # k exceeds the number of nonzeros: the full matrix is returned
    k = X.nnz + 100
    X2, userInds = Sampling.sampleUsers2(X, k)
    nptst.assert_array_equal(X.toarray(), X2.toarray())

    # Test pruning of cols
    k = 500
    m = 100
    n = 500
    u = 0.1
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, w, csarray=True, verbose=True, indsPerRow=200)

    numpy.random.seed(21)
    X2, userInds = Sampling.sampleUsers2(X, k, prune=True)
    nnz1 = X2.nnz
    self.assertTrue((X2.sum(0) != 0).all())

    numpy.random.seed(21)
    X2, userInds = Sampling.sampleUsers2(X, k, prune=False)
    nnz2 = X2.nnz
    # assertEquals is a deprecated alias (removed in Python 3.12)
    self.assertEqual(nnz1, nnz2)

    numRuns = 50
    for i in range(numRuns):
        m = numpy.random.randint(10, 100)
        n = numpy.random.randint(10, 100)
        k = 500
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, w, csarray=True, verbose=True, indsPerRow=200)

        X2, userInds = Sampling.sampleUsers2(X, k)
        # Fixed misplaced parenthesis: the original applied .all() to the zero
        # matrix instead of the comparison; assert the Gram matrix is not all zero.
        self.assertTrue((X.dot(X.T) != numpy.zeros((m, m))).any())
        self.assertTrue((X2.toarray() == X.toarray()[userInds, :]).all())
        self.assertEqual(X.toarray()[userInds, :].nonzero()[0].shape[0], X2.nnz)
def testScale(self):
    """
    Look at the scales of the unnormalised gradients.
    """
    m = 100
    n = 400
    k = 3
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, csarray=True)

    w = 0.1
    eps = 0.001
    learner = MaxAUCTanh(k, w)
    # Disable normalisation so the raw gradient magnitudes are visible
    learner.normalise = False
    learner.lmbdaU = 1.0
    learner.lmbdaV = 1.0
    learner.rho = 1.0
    learner.numAucSamples = 100

    indPtr, colInds = SparseUtils.getOmegaListPtr(X)

    # Random factors and per-row/per-item weight vectors (normalised to sum to 1)
    r = numpy.random.rand(m)
    U = numpy.random.rand(X.shape[0], k)
    V = numpy.random.rand(X.shape[1], k)

    gi = numpy.random.rand(m)
    gi /= gi.sum()
    gp = numpy.random.rand(n)
    gp /= gp.sum()
    gq = numpy.random.rand(n)
    gq /= gq.sum()

    permutedRowInds = numpy.array(numpy.random.permutation(m), numpy.uint32)
    permutedColInds = numpy.array(numpy.random.permutation(n), numpy.uint32)

    maxLocalAuc = MaxLocalAUC(k, w)
    normGp, normGq = maxLocalAuc.computeNormGpq(indPtr, colInds, gp, gq, m)

    # Average norm of the per-user gradient dU_i
    normDui = 0
    for i in range(m):
        du = learner.derivativeUi(indPtr, colInds, U, V, r, gi, gp, gq, i)
        normDui += numpy.linalg.norm(du)

    normDui /= float(m)
    print(normDui)

    # Average norm of the per-item gradient dV_i
    normDvi = 0
    for i in range(n):
        dv = learner.derivativeVi(indPtr, colInds, U, V, r, gi, gp, gq, i)
        normDvi += numpy.linalg.norm(dv)

    normDvi /= float(n)
    print(normDvi)
def __init__(self):
    """Seed the RNG and generate a large low-rank binary matrix for profiling runs."""
    numpy.random.seed(21)

    # Create a low rank matrix
    self.m, self.n, self.k = 1000, 5000, 10
    self.X = SparseUtils.generateSparseBinaryMatrix((self.m, self.n), self.k, csarray=True)
def testScale(self):
    """
    Look at the scales of the unnormalised gradients.
    """
    m = 100
    n = 400
    k = 3
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, csarray=True)

    w = 0.1
    eps = 0.001
    learner = MaxAUCTanh(k, w)
    # Turn off normalisation so raw gradient scales can be inspected
    learner.normalise = False
    learner.lmbdaU = 1.0
    learner.lmbdaV = 1.0
    learner.rho = 1.0
    learner.numAucSamples = 100

    indPtr, colInds = SparseUtils.getOmegaListPtr(X)

    # Random low-rank factors and normalised weight vectors
    r = numpy.random.rand(m)
    U = numpy.random.rand(X.shape[0], k)
    V = numpy.random.rand(X.shape[1], k)

    gi = numpy.random.rand(m)
    gi /= gi.sum()
    gp = numpy.random.rand(n)
    gp /= gp.sum()
    gq = numpy.random.rand(n)
    gq /= gq.sum()

    permutedRowInds = numpy.array(numpy.random.permutation(m), numpy.uint32)
    permutedColInds = numpy.array(numpy.random.permutation(n), numpy.uint32)

    maxLocalAuc = MaxLocalAUC(k, w)
    normGp, normGq = maxLocalAuc.computeNormGpq(indPtr, colInds, gp, gq, m)

    # Mean norm of the per-user derivative
    normDui = 0
    for i in range(m):
        du = learner.derivativeUi(indPtr, colInds, U, V, r, gi, gp, gq, i)
        normDui += numpy.linalg.norm(du)

    normDui /= float(m)
    print(normDui)

    # Mean norm of the per-item derivative
    normDvi = 0
    for i in range(n):
        dv = learner.derivativeVi(indPtr, colInds, U, V, r, gi, gp, gq, i)
        normDvi += numpy.linalg.norm(dv)

    normDvi /= float(n)
    print(normDvi)
def setUp(self):
    """Configure printing/logging and build a pruned binary test matrix."""
    numpy.set_printoptions(precision=3, suppress=True)
    numpy.random.seed(21)
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    self.m, self.n = 30, 20
    k = 5
    u = 0.1
    w = 1 - u
    self.X = SparseUtils.generateSparseBinaryMatrix((self.m, self.n), k, w, csarray=True)
    self.X.prune()
def __init__(self):
    """Seed the RNG, build a low-rank matrix and reset CPU affinity."""
    numpy.random.seed(21)

    # Create a low rank matrix
    m, n = 500, 200
    self.k = 8
    self.X = SparseUtils.generateSparseBinaryMatrix((m, n), self.k, csarray=True)

    # Allow the process to use all CPU cores
    os.system('taskset -p 0xffffffff %d' % os.getpid())
def profileShuffleSplitRows(self):
    """Profile Sampling.shuffleSplitRows on a large sparse matrix."""
    m = 10000
    n = 5000
    k = 5
    u = 0.1
    w = 1 - u
    # Bug fix: with verbose=True the generator returns 5 values (X, U, s, V, wv)
    # as at every other call site; the original unpacked only 4, raising
    # ValueError before the profiling ever started.
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True, verbose=True, indsPerRow=200)

    k2 = 10
    testSize = 2
    ProfileUtils.profile('Sampling.shuffleSplitRows(X, k2, testSize)', globals(), locals())
def testPrecisionAtK(self):
    """precision@k should match a direct per-row computation."""
    m = 10
    n = 5
    r = 3
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, 0.5, verbose=True)

    import sppy
    X = sppy.csarray(X)

    orderedItems = MCEvaluator.recommendAtk(U, V, n)
    # With k=n every nonzero is recommended, so precision equals density nnz/(m*n).
    # assertAlmostEquals is a deprecated alias (removed in Python 3.12)
    self.assertAlmostEqual(MCEvaluator.precisionAtK(X, orderedItems, n), X.nnz / float(m * n))

    k = 2
    orderedItems = MCEvaluator.recommendAtk(U * s, V, k)
    precision, scoreInds = MCEvaluator.precisionAtK(X, orderedItems, k, verbose=True)

    precisions = numpy.zeros(m)
    for i in range(m):
        nonzeroRow = X.toarray()[i, :].nonzero()[0]
        precisions[i] = numpy.intersect1d(scoreInds[i, :], nonzeroRow).shape[0] / float(k)

    self.assertEqual(precision.mean(), precisions.mean())

    # Now try random U and V
    U = numpy.random.rand(m, 3)
    # Fixed: V holds item factors, so it must have n rows (the original used rand(m, 3))
    V = numpy.random.rand(n, 3)
    orderedItems = MCEvaluator.recommendAtk(U * s, V, k)
    precision, scoreInds = MCEvaluator.precisionAtK(X, orderedItems, k, verbose=True)

    precisions = numpy.zeros(m)
    for i in range(m):
        nonzeroRow = X.toarray()[i, :].nonzero()[0]
        precisions[i] = numpy.intersect1d(scoreInds[i, :], nonzeroRow).shape[0] / float(k)

    self.assertEqual(precision.mean(), precisions.mean())
def testRecallAtK(self):
    """recall@k should match a direct per-row computation."""
    m = 10
    n = 5
    r = 3
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, 0.5, verbose=True)

    import sppy
    X = sppy.csarray(X)

    orderedItems = MCEvaluator.recommendAtk(U, V, n)
    # With k=n every relevant item is recommended, so recall is 1.
    # assertAlmostEquals is a deprecated alias (removed in Python 3.12)
    self.assertAlmostEqual(MCEvaluator.recallAtK(X, orderedItems, n), 1.0)

    k = 2
    orderedItems = MCEvaluator.recommendAtk(U * s, V, k)
    recall, scoreInds = MCEvaluator.recallAtK(X, orderedItems, k, verbose=True)

    recalls = numpy.zeros(m)
    for i in range(m):
        nonzeroRow = X.toarray()[i, :].nonzero()[0]
        recalls[i] = numpy.intersect1d(scoreInds[i, :], nonzeroRow).shape[0] / float(nonzeroRow.shape[0])

    self.assertEqual(recall.mean(), recalls.mean())

    # Now try random U and V
    U = numpy.random.rand(m, 3)
    # Fixed: V holds item factors, so it must have n rows (the original used rand(m, 3))
    V = numpy.random.rand(n, 3)
    orderedItems = MCEvaluator.recommendAtk(U, V, k)
    recall, scoreInds = MCEvaluator.recallAtK(X, orderedItems, k, verbose=True)

    recalls = numpy.zeros(m)
    for i in range(m):
        nonzeroRow = X.toarray()[i, :].nonzero()[0]
        recalls[i] = numpy.intersect1d(scoreInds[i, :], nonzeroRow).shape[0] / float(nonzeroRow.shape[0])

    self.assertEqual(recall.mean(), recalls.mean())
def profileSampleUsers(self):
    """Profile Sampling.sampleUsers2 on a large sparse matrix."""
    m, n = 10000, 50000
    k = 5
    u = 0.01
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True, verbose=True, indsPerRow=200)
    print(X.nnz)

    k2 = 100000
    ProfileUtils.profile('Sampling.sampleUsers2(X, k2)', globals(), locals())
def testGenerateSparseBinaryMatrix(self):
    # With a high quantile each row should contain exactly k nonzeros.
    m = 5
    n = 10
    k = 3
    quantile = 0.7
    numpy.random.seed(21)
    X = SparseUtils.generateSparseBinaryMatrix((m,n), k, quantile)
    Xscipy = numpy.array(X.todense())

    nptst.assert_array_equal(numpy.array(X.sum(1)).flatten(), numpy.ones(m)*3)

    # A zero quantile keeps essentially every entry, giving a near-all-ones matrix.
    quantile = 0.0
    X = SparseUtils.generateSparseBinaryMatrix((m,n), k, quantile)
    self.assertTrue(numpy.linalg.norm(X - numpy.ones((m,n))) < 1.1)
    #nptst.assert_array_almost_equal(X.todense(), numpy.ones((m,n)))

    # Repeat with csarray storage; same seed should reproduce the scipy result.
    quantile = 0.7
    numpy.random.seed(21)
    X = SparseUtils.generateSparseBinaryMatrix((m,n), k, quantile, csarray=True)
    Xcsarray = X.toarray()

    nptst.assert_array_equal(numpy.array(X.sum(1)).flatten(), numpy.ones(m)*3)

    quantile = 0.0
    X = SparseUtils.generateSparseBinaryMatrix((m,n), k, quantile, csarray=True)
    self.assertTrue(numpy.linalg.norm(X.toarray() - numpy.ones((m,n))) < 1.1)
    #nptst.assert_array_almost_equal(X.toarray(), numpy.ones((m,n)))

    nptst.assert_array_equal(Xcsarray, Xscipy)

    #Test variation in the quantiles
    w = 0.7
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m,n), k, w, sd=0.1, csarray=True, verbose=True)

    # Recompute the thresholded matrix directly from the low-rank factors:
    # row i keeps entries above its wv[i]-percentile of Z[i, :].
    Z = (U*s).dot(V.T)

    X2 = numpy.zeros((m, n))
    r2 = numpy.zeros(m)
    for i in range(m):
        r2[i] = numpy.percentile(numpy.sort(Z[i, :]), wv[i]*100)
        X2[i, Z[i, :]>r2[i]] = 1

    r = SparseUtilsCython.computeR2(U*s, V, wv)
    nptst.assert_array_almost_equal(X.toarray(), X2)
    nptst.assert_array_almost_equal(r, r2)

    #Test a larger standard deviation
    w = 0.7
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m,n), k, w, sd=0.5, csarray=True, verbose=True)

    Z = (U*s).dot(V.T)

    X2 = numpy.zeros((m, n))
    r2 = numpy.zeros(m)
    for i in range(m):
        r2[i] = numpy.percentile(numpy.sort(Z[i, :]), wv[i]*100)
        # NOTE(review): this branch uses >= where the sd=0.1 case used > —
        # presumably intentional to match the generator at this sd; confirm.
        X2[i, Z[i, :]>=r2[i]] = 1

    r = SparseUtilsCython.computeR2(U*s, V, wv)
    nptst.assert_array_almost_equal(X.toarray(), X2)
    nptst.assert_array_almost_equal(r, r2)
def syntheticDataset1(m=500, n=200, k=8, u=0.1, sd=0, noise=5):
    """
    Create a simple synthetic dataset
    """
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, sd=sd, csarray=True, verbose=True, indsPerRow=200)

    # Sprinkle random noise entries over the matrix and binarise the result
    X = X + sppy.rand((m, n), noise / float(n), storagetype="row")
    X[X.nonzero()] = 1
    X.prune()

    # Drop users with fewer than 10 observations
    X = SparseUtils.pruneMatrixRows(X, minNnzRows=10)
    logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    U = U * s
    return X, U, V
def testModelSelect(self):
    """CLiMF model selection should run end to end on a small matrix."""
    m, n = 50, 20
    k = 5
    u = 0.1
    w = 1 - u
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, w)

    lmbda = 0.1
    gamma = 0.01
    model = CLiMF(k, lmbda, gamma)
    model.max_iters = 10

    model.modelSelect(X)
def testLearnModel(self):
    """BprRecommender should train and produce predictions without error."""
    m, n = 50, 20
    k = 5
    u = 0.1
    w = 1 - u
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True)

    lmbda = 0.1
    gamma = 0.01
    recommender = BprRecommender(k, lmbda, gamma)
    recommender.max_iters = 50

    recommender.learnModel(X)
    Z = recommender.predict(n)
def testLearnModel(self):
    """Train a BprRecommender on a small binary matrix and request predictions."""
    m, n = 50, 20
    k = 5
    u = 0.1
    w = 1 - u
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True)

    lmbda, gamma = 0.1, 0.01
    bpr = BprRecommender(k, lmbda, gamma)
    bpr.max_iters = 50

    bpr.learnModel(X)
    Z = bpr.predict(n)
def testLearningRateSelect(self):
    """learningRateSelect should complete a short run without error."""
    m, n = 10, 20
    k = 5
    u = 0.5
    w = 1 - u
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True)

    eps = 0.001
    learner = MaxLocalAUC(k, u, eps=eps, stochastic=True)
    learner.rate = "optimal"
    learner.maxIterations = 5
    learner.numProcesses = 1

    learner.learningRateSelect(X)
def testLearningRateSelect(self):
    """Short smoke test of the learning-rate selection routine."""
    m, n = 10, 20
    k = 5
    u = 0.5
    w = 1 - u
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True)

    eps = 0.001
    mlauc = MaxLocalAUC(k, u, eps=eps, stochastic=True)
    mlauc.rate = "optimal"
    mlauc.maxIterations = 5
    mlauc.numProcesses = 1

    mlauc.learningRateSelect(X)
def testAverageAuc(self):
    """averageAuc should coincide with localAUC at u=0 (no quantile threshold)."""
    m = 50
    n = 20
    k = 8
    u = 20.0 / m
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True, verbose=True, indsPerRow=200)

    auc = MCEvaluator.averageAuc(X, U, V)

    u = 0.0
    auc2 = MCEvaluator.localAUC(X, U, V, u)

    # assertAlmostEquals is a deprecated alias (removed in Python 3.12)
    self.assertAlmostEqual(auc, auc2)
def testLocalAucApprox2(self):
    """localAUCApprox should converge towards localAUC as the sample count grows."""
    m = 100
    n = 200
    k = 5
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, csarray=True, verbose=True)

    w = 0.5
    localAuc = MCEvaluator.localAUC(X, U, V, w)

    samples = numpy.arange(50, 200, 10)
    for i, sampleSize in enumerate(samples):
        localAuc2 = MCEvaluator.localAUCApprox(SparseUtils.getOmegaListPtr(X), U, V, w, sampleSize)
        self.assertAlmostEqual(localAuc2, localAuc, 1)

    # Test more accurately
    sampleSize = 1000
    localAuc2 = MCEvaluator.localAUCApprox(SparseUtils.getOmegaListPtr(X), U, V, w, sampleSize)
    self.assertAlmostEqual(localAuc2, localAuc, 2)

    # Use a high-sample approximation as the baseline and check stability of
    # the estimator. (Removed unused numInds, r and Z variables — dead code
    # in the original whose "high r" comment no longer described anything.)
    localAuc = MCEvaluator.localAUCApprox(SparseUtils.getOmegaListPtr(X), U, V, w, sampleSize)

    for i, sampleSize in enumerate(samples):
        localAuc2 = MCEvaluator.localAUCApprox(SparseUtils.getOmegaListPtr(X), U, V, w, sampleSize)
        self.assertAlmostEqual(localAuc2, localAuc, 1)

    # Test more accurately
    sampleSize = 1000
    localAuc2 = MCEvaluator.localAUCApprox(SparseUtils.getOmegaListPtr(X), U, V, w, sampleSize)
    self.assertAlmostEqual(localAuc2, localAuc, 2)
def testRecommendAtk(self):
    """recommendAtk should match argmax-based ranking and exclude training entries."""
    m = 20
    n = 50
    r = 3
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, 0.5, verbose=True)

    import sppy
    X = sppy.csarray(X)

    k = 10
    orderedItems, scores = MCEvaluator.recommendAtk(U, V, k, verbose=True)

    # Now do it manually
    Z = U.dot(V.T)
    orderedItems2 = Util.argmaxN(Z, k)
    scores2 = numpy.fliplr(numpy.sort(Z, 1))[:, 0:k]

    nptst.assert_array_equal(orderedItems, orderedItems2)
    nptst.assert_array_equal(scores, scores2)

    # Test case where we have a set of training indices to remove
    # Let's create a random omegaList
    omegaList = []
    for i in range(m):
        omegaList.append(numpy.random.permutation(n)[0:5])

    orderedItems = MCEvaluator.recommendAtk(U, V, k, omegaList=omegaList)
    orderedItems2 = MCEvaluator.recommendAtk(U, V, k)

    for i in range(m):
        # Training items must never appear in the filtered recommendations.
        items = numpy.intersect1d(omegaList[i], orderedItems[i, :])
        # assertEquals is a deprecated alias (removed in Python 3.12)
        self.assertEqual(items.shape[0], 0)

        items = numpy.union1d(omegaList[i], orderedItems[i, :])
        items = numpy.intersect1d(items, orderedItems2[i, :])
        nptst.assert_array_equal(items, numpy.sort(orderedItems2[i, :]))
def testReciprocalRankAtk(self):
    """Compare Cython reciprocal rank against a straightforward Python computation."""
    m, n = 20, 50
    r = 3
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, 0.5, verbose=True, csarray=True)

    k = 5
    orderedItems = numpy.random.randint(0, n, m * k)
    orderedItems = numpy.reshape(orderedItems, (m, k))
    orderedItems = numpy.array(orderedItems, numpy.int32)

    indPtr, colInds = X.nonzeroRowsPtr()
    indPtr = numpy.array(indPtr, numpy.uint32)
    colInds = numpy.array(colInds, numpy.uint32)

    rrs = MCEvaluatorCython.reciprocalRankAtk(indPtr, colInds, orderedItems)

    expected = numpy.zeros(m)
    for row in range(m):
        omegai = colInds[indPtr[row]:indPtr[row + 1]]
        for rank in range(k):
            if orderedItems[row, rank] in omegai:
                expected[row] = 1 / float(1 + rank)
                break

    nptst.assert_array_equal(rrs, expected)

    # No recommended item is relevant: reciprocal ranks are all zero
    orderedItems = numpy.ones((m, k), numpy.int32) * (n + 1)
    rrs = MCEvaluatorCython.reciprocalRankAtk(indPtr, colInds, orderedItems)
    nptst.assert_array_equal(rrs, numpy.zeros(m))

    # Put a relevant item in position 2 of every row: reciprocal rank is 1/2
    for row in range(m):
        omegai = colInds[indPtr[row]:indPtr[row + 1]]
        orderedItems[row, 1] = omegai[0]
    rrs = MCEvaluatorCython.reciprocalRankAtk(indPtr, colInds, orderedItems)
    nptst.assert_array_equal(rrs, numpy.ones(m) * 0.5)
def profileShuffleSplitRows(self):
    """Profile Sampling.shuffleSplitRows on a large sparse matrix."""
    m = 10000
    n = 5000
    k = 5
    u = 0.1
    w = 1 - u
    # Bug fix: with verbose=True the generator returns 5 values (X, U, s, V, wv);
    # the original unpacked only 4, which raises ValueError before profiling.
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True, verbose=True, indsPerRow=200)

    k2 = 10
    testSize = 2
    ProfileUtils.profile('Sampling.shuffleSplitRows(X, k2, testSize)', globals(), locals())
def testModelSelect(self):
    """WeightedMf model selection over a small grid of ranks."""
    m = n = 50
    k = 5
    u = 0.5
    w = 1 - u
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, w)

    # Allow the process to use all CPU cores
    os.system('taskset -p 0xffffffff %d' % os.getpid())

    model = WeightedMf(k)
    model.maxIterations = 10
    model.ks = 2 ** numpy.arange(3, 5)
    model.folds = 2

    model.modelSelect(X)
def testLearnModel(self):
    """CLiMF should train and return predictions (rows may all be identical)."""
    m, n = 50, 20
    k = 5
    u = 0.1
    w = 1 - u
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, w)

    lmbda, gamma = 0.1, 0.01
    model = CLiMF(k, lmbda, gamma)
    model.max_iters = 50

    model.learnModel(X)
    Z = model.predict(n)

    # Bit weird that all rows are the same
    print(Z)
def testRecommendAtk(self):
    """Cython recommendAtk should agree with the Python version and respect exclusions."""
    m = 20
    n = 50
    r = 3
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, 0.5, verbose=True)

    import sppy
    X = sppy.csarray(X)
    k = 10

    # Rebuild X so row i contains exactly the 5 entries listed in omegaList[i]
    X = numpy.zeros(X.shape)
    omegaList = []
    for i in range(m):
        omegaList.append(numpy.random.permutation(n)[0:5])
        X[i, omegaList[i]] = 1
    X = sppy.csarray(X)

    orderedItems = MCEvaluatorCython.recommendAtk(U, V, k, X)
    orderedItems2 = MCEvaluator.recommendAtk(U, V, k, omegaList=omegaList)

    nptst.assert_array_equal(orderedItems[orderedItems2 != -1], orderedItems2[orderedItems2 != -1])

    for i in range(m):
        # Items present in X must never be recommended
        items = numpy.intersect1d(omegaList[i], orderedItems[i, :])
        # assertEquals is a deprecated alias (removed in Python 3.12)
        self.assertEqual(items.shape[0], 0)

    # Now let's have an all zeros X: nothing is excluded, results must match
    X = sppy.csarray(X.shape)
    orderedItems = MCEvaluatorCython.recommendAtk(U, V, k, X)
    orderedItems2 = MCEvaluator.recommendAtk(U, V, k)

    nptst.assert_array_equal(orderedItems, orderedItems2)
def profileSampleUsers(self):
    """Time Sampling.sampleUsers2 against a very large sparse matrix."""
    m, n = 10000, 50000
    k = 5
    u = 0.01
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True, verbose=True, indsPerRow=200)
    print(X.nnz)

    k2 = 100000
    ProfileUtils.profile('Sampling.sampleUsers2(X, k2)', globals(), locals())
def testLearnModel(self):
    """learnModel should run in batch mode, stochastic mode, and without validation."""
    m, n = 50, 20
    k = 5
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, csarray=True)

    u = 0.1
    w = 1 - u
    eps = 0.05
    learner = MaxLocalAUC(k, w, alpha=5.0, eps=eps, stochastic=False)

    # Batch gradient descent
    U, V = learner.learnModel(X)

    # Stochastic gradient descent
    learner.stochastic = True
    U, V = learner.learnModel(X)

    # Test case where we do not have a validation set
    learner.validationUsers = 0.0
    U, V = learner.learnModel(X)
def testAverageAuc(self):
    """averageAuc and localAUC with u=0 compute the same quantity."""
    m = 50
    n = 20
    k = 8
    u = 20.0 / m
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True, verbose=True, indsPerRow=200)

    auc = MCEvaluator.averageAuc(X, U, V)

    u = 0.0
    auc2 = MCEvaluator.localAUC(X, U, V, u)

    # assertAlmostEquals is a deprecated alias (removed in Python 3.12)
    self.assertAlmostEqual(auc, auc2)
def testRecommendAtk(self):
    """recommendAtk should equal a manual argmax ranking and honour omegaList exclusions."""
    m = 20
    n = 50
    r = 3
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, 0.5, verbose=True)

    import sppy
    X = sppy.csarray(X)

    k = 10
    orderedItems, scores = MCEvaluator.recommendAtk(U, V, k, verbose=True)

    # Now do it manually
    Z = U.dot(V.T)
    orderedItems2 = Util.argmaxN(Z, k)
    scores2 = numpy.fliplr(numpy.sort(Z, 1))[:, 0:k]

    nptst.assert_array_equal(orderedItems, orderedItems2)
    nptst.assert_array_equal(scores, scores2)

    # Test case where we have a set of training indices to remove
    # Let's create a random omegaList
    omegaList = []
    for i in range(m):
        omegaList.append(numpy.random.permutation(n)[0:5])

    orderedItems = MCEvaluator.recommendAtk(U, V, k, omegaList=omegaList)
    orderedItems2 = MCEvaluator.recommendAtk(U, V, k)

    for i in range(m):
        # Excluded training items must not be recommended
        items = numpy.intersect1d(omegaList[i], orderedItems[i, :])
        # assertEquals is a deprecated alias (removed in Python 3.12)
        self.assertEqual(items.shape[0], 0)

        items = numpy.union1d(omegaList[i], orderedItems[i, :])
        items = numpy.intersect1d(items, orderedItems2[i, :])
        nptst.assert_array_equal(items, numpy.sort(orderedItems2[i, :]))
def testLearnModel(self):
    """Run learnModel in batch and stochastic modes, then without a validation set."""
    m, n = 50, 20
    k = 5
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, csarray=True)

    u = 0.1
    w = 1 - u
    eps = 0.05
    mlauc = MaxLocalAUC(k, w, alpha=5.0, eps=eps, stochastic=False)

    U, V = mlauc.learnModel(X)

    mlauc.stochastic = True
    U, V = mlauc.learnModel(X)

    # No validation users at all
    mlauc.validationUsers = 0.0
    U, V = mlauc.learnModel(X)
def testPrecisionAtK(self):
    """precision@k should equal a manual per-row intersection count over k."""
    m = 10
    n = 5
    r = 3
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, 0.5, verbose=True)

    import sppy
    X = sppy.csarray(X)

    orderedItems = MCEvaluator.recommendAtk(U, V, n)
    # With k=n every nonzero is recommended, so precision equals density nnz/(m*n).
    # assertAlmostEquals is a deprecated alias (removed in Python 3.12)
    self.assertAlmostEqual(MCEvaluator.precisionAtK(X, orderedItems, n), X.nnz / float(m * n))

    k = 2
    orderedItems = MCEvaluator.recommendAtk(U * s, V, k)
    precision, scoreInds = MCEvaluator.precisionAtK(X, orderedItems, k, verbose=True)

    precisions = numpy.zeros(m)
    for i in range(m):
        nonzeroRow = X.toarray()[i, :].nonzero()[0]
        precisions[i] = numpy.intersect1d(scoreInds[i, :], nonzeroRow).shape[0] / float(k)

    self.assertEqual(precision.mean(), precisions.mean())

    # Now try random U and V
    U = numpy.random.rand(m, 3)
    # Fixed: V holds item factors, so it must have n rows (the original used rand(m, 3))
    V = numpy.random.rand(n, 3)
    orderedItems = MCEvaluator.recommendAtk(U * s, V, k)
    precision, scoreInds = MCEvaluator.precisionAtK(X, orderedItems, k, verbose=True)

    precisions = numpy.zeros(m)
    for i in range(m):
        nonzeroRow = X.toarray()[i, :].nonzero()[0]
        precisions[i] = numpy.intersect1d(scoreInds[i, :], nonzeroRow).shape[0] / float(k)

    self.assertEqual(precision.mean(), precisions.mean())
def testModelSelectMaxNorm(self):
    """Model selection with max-norm regularisation should run end to end."""
    m, n = 10, 20
    k = 5
    u = 0.5
    w = 1 - u
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True)

    # Allow the process to use all CPU cores
    os.system('taskset -p 0xffffffff %d' % os.getpid())

    eps = 0.001
    k = 5
    learner = MaxLocalAUC(k, w, eps=eps, stochastic=True)
    learner.maxIterations = 5
    learner.recordStep = 1
    learner.validationSize = 3
    learner.metric = "f1"

    learner.modelSelectNorm(X)
def testModelSelectMaxNorm(self):
    """Smoke test for modelSelectNorm (max-norm regularised model selection)."""
    m, n = 10, 20
    k = 5
    u = 0.5
    w = 1 - u
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True)

    os.system('taskset -p 0xffffffff %d' % os.getpid())

    eps = 0.001
    k = 5
    mlauc = MaxLocalAUC(k, w, eps=eps, stochastic=True)
    mlauc.maxIterations = 5
    mlauc.recordStep = 1
    mlauc.validationSize = 3
    mlauc.metric = "f1"

    mlauc.modelSelectNorm(X)
def testRecommendAtk(self):
    """Cython recommendAtk should match the Python implementation and exclude X's entries."""
    m = 20
    n = 50
    r = 3
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, 0.5, verbose=True)

    import sppy
    X = sppy.csarray(X)
    k = 10

    # Rebuild X so row i contains exactly the 5 entries in omegaList[i]
    X = numpy.zeros(X.shape)
    omegaList = []
    for i in range(m):
        omegaList.append(numpy.random.permutation(n)[0:5])
        X[i, omegaList[i]] = 1
    X = sppy.csarray(X)

    orderedItems = MCEvaluatorCython.recommendAtk(U, V, k, X)
    orderedItems2 = MCEvaluator.recommendAtk(U, V, k, omegaList=omegaList)

    nptst.assert_array_equal(orderedItems[orderedItems2 != -1], orderedItems2[orderedItems2 != -1])

    for i in range(m):
        # Items present in X must never be recommended
        items = numpy.intersect1d(omegaList[i], orderedItems[i, :])
        # assertEquals is a deprecated alias (removed in Python 3.12)
        self.assertEqual(items.shape[0], 0)

    # Now let's have an all zeros X: with no exclusions both versions agree
    X = sppy.csarray(X.shape)
    orderedItems = MCEvaluatorCython.recommendAtk(U, V, k, X)
    orderedItems2 = MCEvaluator.recommendAtk(U, V, k)

    nptst.assert_array_equal(orderedItems, orderedItems2)
def testRecallAtK(self):
    """recall@k should equal a manual per-row intersection over the relevant items."""
    m = 10
    n = 5
    r = 3
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, 0.5, verbose=True)

    import sppy
    X = sppy.csarray(X)

    orderedItems = MCEvaluator.recommendAtk(U, V, n)
    # With k=n every relevant item is recommended, so recall is 1.
    # assertAlmostEquals is a deprecated alias (removed in Python 3.12)
    self.assertAlmostEqual(MCEvaluator.recallAtK(X, orderedItems, n), 1.0)

    k = 2
    orderedItems = MCEvaluator.recommendAtk(U * s, V, k)
    recall, scoreInds = MCEvaluator.recallAtK(X, orderedItems, k, verbose=True)

    recalls = numpy.zeros(m)
    for i in range(m):
        nonzeroRow = X.toarray()[i, :].nonzero()[0]
        recalls[i] = numpy.intersect1d(scoreInds[i, :], nonzeroRow).shape[0] / float(nonzeroRow.shape[0])

    self.assertEqual(recall.mean(), recalls.mean())

    # Now try random U and V
    U = numpy.random.rand(m, 3)
    # Fixed: V holds item factors, so it must have n rows (the original used rand(m, 3))
    V = numpy.random.rand(n, 3)
    orderedItems = MCEvaluator.recommendAtk(U, V, k)
    recall, scoreInds = MCEvaluator.recallAtK(X, orderedItems, k, verbose=True)

    recalls = numpy.zeros(m)
    for i in range(m):
        nonzeroRow = X.toarray()[i, :].nonzero()[0]
        recalls[i] = numpy.intersect1d(scoreInds[i, :], nonzeroRow).shape[0] / float(nonzeroRow.shape[0])

    self.assertEqual(recall.mean(), recalls.mean())
def testReciprocalRankAtk(self):
    """Compare MCEvaluatorCython.reciprocalRankAtk against a direct Python
    computation of the reciprocal rank of each row's first relevant item."""
    m = 20
    n = 50
    r = 3
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, 0.5, verbose=True, csarray=True)

    k = 5
    orderedItems = numpy.random.randint(0, n, m * k)
    orderedItems = numpy.reshape(orderedItems, (m, k))
    orderedItems = numpy.array(orderedItems, numpy.int32)
    (indPtr, colInds) = X.nonzeroRowsPtr()
    indPtr = numpy.array(indPtr, numpy.uint32)
    colInds = numpy.array(colInds, numpy.uint32)

    rrs = MCEvaluatorCython.reciprocalRankAtk(indPtr, colInds, orderedItems)

    # Reference: reciprocal of the 1-based rank of the first hit, else 0.
    expectedRrs = numpy.zeros(m)
    for row in range(m):
        relevant = colInds[indPtr[row]:indPtr[row + 1]]
        for rank in range(k):
            if orderedItems[row, rank] in relevant:
                expectedRrs[row] = 1 / float(1 + rank)
                break
    nptst.assert_array_equal(rrs, expectedRrs)

    # A ranking containing no relevant items yields zero for every row.
    orderedItems = numpy.ones((m, k), numpy.int32) * (n + 1)
    rrs = MCEvaluatorCython.reciprocalRankAtk(indPtr, colInds, orderedItems)
    nptst.assert_array_equal(rrs, numpy.zeros(m))

    # Placing a relevant item at rank 2 (index 1) in every row yields 1/2.
    for row in range(m):
        relevant = colInds[indPtr[row]:indPtr[row + 1]]
        orderedItems[row, 1] = relevant[0]
    rrs = MCEvaluatorCython.reciprocalRankAtk(indPtr, colInds, orderedItems)
    nptst.assert_array_equal(rrs, numpy.ones(m) * 0.5)
def testRestrictOmega(self):
    """restrictOmega must keep, for every row, exactly those omega entries that
    fall inside the chosen column subset.

    Fix: the original reused ``i`` for both the runs loop and the rows loop,
    shadowing the outer index; the loops now use distinct variables.
    """
    m = 50
    n = 100
    k = 5
    u = 0.5
    w = 1 - u
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True)
    indPtr, colInds = SparseUtils.getOmegaListPtr(X)

    runs = 100
    for run in range(runs):
        colSubset = numpy.random.choice(n, 20, replace=False)
        newIndPtr, newColInds = restrictOmega(indPtr, colInds, colSubset)

        for row in range(m):
            omegai = colInds[indPtr[row]:indPtr[row + 1]]
            omegai2 = newColInds[newIndPtr[row]:newIndPtr[row + 1]]

            # Every entry dropped by the restriction must lie outside colSubset.
            removed = numpy.setdiff1d(omegai, omegai2)
            self.assertEquals(numpy.intersect1d(removed, colSubset).shape[0], 0)
def testParallelLearnModel(self):
    """Smoke test: parallelLearnModel should run to completion on the
    synthetic dataset (no assertions beyond not raising).

    Fix: removed the dead random X (and its m, n) that was generated and then
    immediately overwritten by ``DatasetUtils.syntheticDataset1()``.
    """
    numpy.random.seed(21)
    k = 5

    from wallhack.rankingexp.DatasetUtils import DatasetUtils
    X, U, V = DatasetUtils.syntheticDataset1()

    u = 0.1
    w = 1 - u
    eps = 0.05
    maxLocalAuc = MaxLocalAUC(k, w, alpha=1.0, eps=eps, stochastic=True)
    maxLocalAuc.maxIterations = 3
    maxLocalAuc.recordStep = 1
    maxLocalAuc.rate = "optimal"
    maxLocalAuc.t0 = 2.0
    maxLocalAuc.validationUsers = 0.0
    maxLocalAuc.numProcesses = 4

    # NOTE(review): presumably resets CPU affinity so worker processes can use
    # all cores — confirm this workaround is still needed.
    os.system('taskset -p 0xffffffff %d' % os.getpid())
    print(X.nnz / maxLocalAuc.numAucSamples)

    U, V = maxLocalAuc.parallelLearnModel(X)
def profileLocalAucApprox(self):
    """Profile repeated calls to MCEvaluator.localAUCApprox on a medium matrix.

    Fix: with ``verbose=True`` the generator returns five values
    (X, U, s, V, wv) — as every other call in this file unpacks — but the
    original unpacked only four, raising ValueError before profiling began.
    """
    m = 500
    n = 1000
    k = 10
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, csarray=True, verbose=True)

    u = 0.1
    w = 1 - u
    numAucSamples = 200
    omegaList = SparseUtils.getOmegaList(X)
    r = SparseUtilsCython.computeR(U, V, w, numAucSamples)

    numRuns = 10

    def run():
        # Repeat the approximation so the profiler accumulates a useful sample.
        for i in range(numRuns):
            MCEvaluator.localAUCApprox(X, U, V, omegaList, numAucSamples, r)

    ProfileUtils.profile('run()', globals(), locals())
def testF1Atk(self):
    """f1AtK should equal the harmonic mean of precisionAtK and recallAtK,
    with F1 defined as 0 when precision + recall is 0."""
    m = 10
    n = 5
    r = 3
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, 0.5, verbose=True)

    import sppy
    X = sppy.csarray(X)

    # Recommending all n items: precision is r/n, recall is 1, so
    # F1 = 2*(r/n) / (1 + r/n).
    orderedItems = MCEvaluator.recommendAtk(U * s, V, n)
    expected = 2 * r / float(n) / (1 + r / float(n))
    self.assertAlmostEquals(MCEvaluator.f1AtK(X, orderedItems, n, verbose=False), expected)

    m = 20
    n = 50
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, 0.5, verbose=True)
    k = 5
    orderedItems = MCEvaluator.recommendAtk(U * s, V, k)
    precision, scoreInds = MCEvaluator.precisionAtK(X, orderedItems, k, verbose=True)
    recall, scoreInds = MCEvaluator.recallAtK(X, orderedItems, k, verbose=True)

    # Hand-computed harmonic mean of precision and recall per user.
    expectedF1s = numpy.zeros(m)
    for j in range(m):
        expectedF1s[j] = 2 * precision[j] * recall[j] / (precision[j] + recall[j])

    orderedItems = MCEvaluator.recommendAtk(U * s, V, n)
    f1s, scoreInds = MCEvaluator.f1AtK(X, orderedItems, k, verbose=True)
    nptst.assert_array_equal(expectedF1s, f1s)

    # Force a row with zero precision and recall; its F1 must be 0.
    orderedItems[5, :] = -1
    precision, scoreInds = MCEvaluator.precisionAtK(X, orderedItems, k, verbose=True)
    recall, scoreInds = MCEvaluator.recallAtK(X, orderedItems, k, verbose=True)

    expectedF1s = numpy.zeros(m)
    for j in range(m):
        if precision[j] + recall[j] != 0:
            expectedF1s[j] = 2 * precision[j] * recall[j] / (precision[j] + recall[j])

    f1s, scoreInds = MCEvaluator.f1AtK(X, orderedItems, k, verbose=True)
    nptst.assert_array_equal(expectedF1s, f1s)
def testShuffleSplitRows(self):
    """Exercise Sampling.shuffleSplitRows: each fold's train/test pair must
    partition X, respect the requested storage layout, test-set size, row
    subset, and column sampling probabilities.

    Fix: ``dtype=numpy.float`` — the ``numpy.float`` alias was removed in
    NumPy 1.24 — replaced with ``numpy.float64`` (the same dtype here).
    """
    m = 10
    n = 16
    k = 5
    u = 0.5
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True, verbose=True, indsPerRow=200)

    k2 = 5
    testSize = 2

    # Row-major output: each fold partitions X, with testSize test entries per row.
    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, rowMajor=True)
    for i in range(k2):
        trainX = trainTestXs[i][0]
        testX = trainTestXs[i][1]
        self.assertEquals(trainX.storagetype, "row")
        self.assertEquals(testX.storagetype, "row")
        nptst.assert_array_almost_equal(X.toarray(), (trainX + testX).toarray())
        nptst.assert_array_equal(testX.sum(1), testSize * numpy.ones(m))
        self.assertEquals(X.nnz, trainX.nnz + testX.nnz)

    # Column-major output.
    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, rowMajor=False)
    for i in range(k2):
        trainX = trainTestXs[i][0]
        testX = trainTestXs[i][1]
        self.assertEquals(trainX.storagetype, "col")
        self.assertEquals(testX.storagetype, "col")
        nptst.assert_array_almost_equal(X.toarray(), (trainX + testX).toarray())
        nptst.assert_array_equal(testX.sum(1), testSize * numpy.ones(m))
        self.assertEquals(X.nnz, trainX.nnz + testX.nnz)

    # scipy.sparse output (csarray=False): row sums come back 2-d, so ravel them.
    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, csarray=False)
    for i in range(k2):
        trainX = trainTestXs[i][0]
        testX = trainTestXs[i][1]
        nptst.assert_array_almost_equal(X.toarray(), (trainX + testX).toarray())
        nptst.assert_array_equal(numpy.ravel(testX.sum(1)), testSize * numpy.ones(m))
        self.assertEquals(X.nnz, trainX.nnz + testX.nnz)

    # Empty test sets: all data stays in the training matrix.
    testSize = 0
    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize)
    for i in range(k2):
        trainX = trainTestXs[i][0]
        testX = trainTestXs[i][1]
        nptst.assert_array_almost_equal(X.toarray(), (trainX + testX).toarray())
        nptst.assert_array_equal(testX.sum(1), testSize * numpy.ones(m))
        self.assertEquals(X.nnz, trainX.nnz + testX.nnz)
        self.assertEquals(testX.nnz, 0)

    # Sample only a subset of the rows: exactly numRows rows get test entries.
    testSize = 2
    numRows = 5
    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, numRows=numRows, rowMajor=False)
    for i in range(k2):
        trainX = trainTestXs[i][0]
        testX = trainTestXs[i][1]
        nptst.assert_array_almost_equal(X.toarray(), (trainX + testX).toarray())
        self.assertEquals(numpy.nonzero(testX.sum(1))[0].shape[0], numRows)
        self.assertEquals(X.nnz, trainX.nnz + testX.nnz)
        self.assertEquals(testX.nnz, testSize * numRows)

    # Column probabilities: empirical test-column frequencies over many folds
    # should approach colProbs.
    w = 0.0
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True, verbose=True, indsPerRow=200)
    testSize = 5
    k2 = 500

    colProbs = numpy.arange(0, n, dtype=numpy.float64) + 1
    colProbs /= colProbs.sum()
    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, colProbs=colProbs)

    colProbs2 = numpy.zeros(n)
    for i in range(k2):
        trainX = trainTestXs[i][0]
        testX = trainTestXs[i][1]
        colProbs2 += testX.sum(0)
    colProbs2 /= colProbs2.sum()
    nptst.assert_array_almost_equal(colProbs, colProbs2, 2)

    # Explicit uniform probabilities should match the default (colProbs=None).
    colProbs = numpy.ones(n) / float(n)
    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, colProbs=colProbs)
    colProbs = None
    trainTestXs2 = Sampling.shuffleSplitRows(X, k2, testSize, colProbs=colProbs)

    colProbs2 = numpy.zeros(n)
    colProbs3 = numpy.zeros(n)
    for i in range(k2):
        trainX = trainTestXs[i][0]
        testX = trainTestXs[i][1]
        colProbs2 += testX.sum(0)

        trainX = trainTestXs2[i][0]
        testX = trainTestXs2[i][1]
        colProbs3 += testX.sum(0)
    colProbs2 /= colProbs2.sum()
    colProbs3 /= colProbs3.sum()
    nptst.assert_array_almost_equal(colProbs2, colProbs3, 2)

    # numRows=m must reproduce the default behaviour under the same seed.
    numpy.random.seed(21)
    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, numRows=m)
    numpy.random.seed(21)
    trainTestXs2 = Sampling.shuffleSplitRows(X, k2, testSize)
    nptst.assert_array_equal(trainTestXs[0][0].toarray(), trainTestXs2[0][0].toarray())
    nptst.assert_array_equal(trainTestXs[0][1].toarray(), trainTestXs2[0][1].toarray())