def train_test(density, nmodes, side_info):
    """Build a seeded synthetic train/test split together with side information.

    One random column-vector factor is drawn per mode and the factors are
    combined with ``np.einsum`` into a dense array ``Y``.

    Args:
        density: When below 1, ``Y`` is first replaced by the second output of
            ``smurff.make_train_test(Y, density, ...)``.
        nmodes: Number of modes (and factors) of the generated data.
        side_info: Callable applied to the first factor; its result is
            returned as the third element.

    Returns:
        Tuple ``(Ytrain, Ytest, side_info(first_factor))``.
    """
    # `seed` is a module-level name; seeding keeps the factors reproducible.
    np.random.seed(seed)

    # Factor k (1-based) has shape (4 * k, 1).
    factors = [np.random.randn(4 * mode, 1) for mode in range(1, nmodes + 1)]

    # einsum's interleaved call form: operand, labels, operand, labels, ...
    # Factor k is labelled [k, 0], so label 0 is shared by every factor.
    einsum_args = []
    for axis_label, factor in enumerate(factors, start=1):
        einsum_args.append(factor)
        einsum_args.append([axis_label, 0])
    Y = np.einsum(*einsum_args)

    if density < 1.:
        # Keep only the "test" part of a split, discarding the rest.
        _, Y = smurff.make_train_test(Y, density, seed=seed)

    Ytrain, Ytest = smurff.make_train_test(Y, 0.5, seed=seed)
    return Ytrain, Ytest, side_info(factors[0])
def test_bmf_pp(self):
    """Check that a BPMF predict session reports consistent statistics.

    Trains on a random sparse matrix, then verifies that
    * the RMSE reported by the predict session matches the RMSE recomputed
      from its predictions, and
    * the posterior mean/precision reported per mode match the values from
      ``calc_posteriorMeanPrec``.
    """
    Y = scipy.sparse.rand(30, 20, 0.2)
    Y, Ytest = smurff.make_train_test(Y, 0.5)

    session = smurff.BPMFSession(Y,
                                 Ytest=Ytest,
                                 num_latent=4,
                                 verbose=verbose,
                                 burnin=5,
                                 nsamples=20,
                                 save_freq=1)
    session.run()

    predict_session = session.makePredictSession()
    sess_rmse = float(predict_session.statsYTest()["rmse_avg"])
    Ypred, _ = predict_session.predictionsYTest()

    # Compare values entry by entry: scipy.sparse.find returns coordinates in
    # a canonical order, so the two value arrays line up only if the
    # coordinates agree. Comparing raw `.data` arrays would silently pair up
    # unrelated entries if the two matrices stored them in different orders.
    test_rows, test_cols, test_vals = scipy.sparse.find(Ytest)
    pred_rows, pred_cols, pred_vals = scipy.sparse.find(Ypred)
    np.testing.assert_array_equal(pred_rows, test_rows)
    np.testing.assert_array_equal(pred_cols, test_cols)

    calc_rmse = math.sqrt(mean_squared_error(test_vals, pred_vals))
    self.assertAlmostEqual(sess_rmse, calc_rmse, 2)

    for m in range(predict_session.nmodes):
        calc_mu, calc_Lambda = calc_posteriorMeanPrec(predict_session, m)
        sess_mu, sess_Lambda = predict_session.postMuLambda(m)
        np.testing.assert_almost_equal(calc_mu, sess_mu)
        np.testing.assert_almost_equal(calc_Lambda, sess_Lambda)
def test_macau_dense_probit(self):
    """Probit Macau with dense side information must reach an AUC above 0.55.

    Binary labels are the sign of the inner product between rows of two
    random factor matrices; the first factor matrix is also supplied as side
    information for mode 0.
    """
    A = np.random.randn(25, 2)
    B = np.random.randn(3, 2)

    # Every (row of A, row of B) combination becomes one observation.
    pairs = list(
        itertools.product(np.arange(A.shape[0]), np.arange(B.shape[0])))
    df = pd.DataFrame(np.asarray(pairs), columns=["A", "B"])

    inner_products = np.array(
        [np.sum(A[row, :] * B[col, :]) for row, col in pairs])
    df["value"] = (inner_products > 0.0).astype(np.float64)

    Ytrain, Ytest = smurff.make_train_test(df, 0.2)

    threshold = 0.5  # since we sample from mu(0,1)

    trainSession = smurff.TrainSession(
        priors=['macau', 'normal'],
        num_latent=4,
        threshold=threshold,
        burnin=200,
        nsamples=200,
        verbose=False,
    )
    trainSession.addTrainAndTest(Ytrain, Ytest,
                                 smurff.ProbitNoise(threshold))
    trainSession.addSideInfo(0, A, direct=True)

    predictions = trainSession.run()
    auc = smurff.calc_auc(predictions, 0.5)

    failure_msg = (
        "Probit factorization (with dense side) gave AUC below 0.55 (%f)."
        % auc)
    self.assertTrue(auc > 0.55, msg=failure_msg)
def gen_and_write(shape, num_latent, density, center="none"):
    """Generate a synthetic matrix and write its train/test split to disk.

    The output directory is named after the shape, latent dimension, density
    (as a percentage) and centering mode. If the directory already exists
    nothing is written.

    Args:
        shape: Dimensions passed to ``gen_matrix``.
        num_latent: Latent dimension passed to ``gen_matrix``.
        density: Density passed to ``gen_matrix``.
        center: One of ``"none"``, ``"row"``, ``"col"`` or ``"global"``.
    """
    Y, X, W = gen_matrix(shape, num_latent, density)
    Ytrain, Ytest = make_train_test(Y, 0.8)

    shape_tag = "_".join(str(dim) for dim in shape)
    percent = int(density * 100)
    dirname = "data_%s_%d_%d_%s" % (shape_tag, num_latent, percent, center)

    if os.path.exists(dirname):
        print("Already exists: %s. Skipping" % dirname)
        return

    print("%s..." % dirname)
    os.makedirs(dirname)

    # NOTE(review): the centering below rebinds/mutates Y only; Ytrain and
    # Ytest were split off before this point and are what gets written, so
    # `center` appears to influence just the directory name. Confirm whether
    # centering was meant to happen before the split.
    # PAY ATTENTION TO AXIS ORDER
    if center == "row":
        Y = preprocessing.scale(Y, axis=0, with_std=False)
    elif center == "col":
        Y = preprocessing.scale(Y, axis=1, with_std=False)
    elif center == "global":
        Y.data = Y.data - np.mean(Y.data)
    else:
        assert center == "none"

    outputs = (
        ("train", Ytrain),
        ("test", Ytest),
        ("rows", X),
        ("cols", W),
    )
    for label, matrix in outputs:
        write_matrix(dirname, label, matrix)
def test_bpmf_tensor3(self):
    """BPMF on a fully observed 3-mode tensor must reach RMSE below 0.5.

    The tensor entries are sums over the elementwise product of one row from
    each of three random factor matrices.
    """
    A = np.random.randn(15, 2)
    B = np.random.randn(20, 2)
    C = np.random.randn(1, 2)

    # All coordinate triples of the tensor.
    coords = list(
        itertools.product(np.arange(A.shape[0]),
                          np.arange(B.shape[0]),
                          np.arange(C.shape[0])))
    df = pd.DataFrame(np.asarray(coords), columns=["A", "B", "C"])
    df["value"] = np.array(
        [np.sum(A[a, :] * B[b, :] * C[c, :]) for a, b, c in coords])

    Ytrain, Ytest = smurff.make_train_test(df, 0.2)

    predictions = smurff.smurff(
        Ytrain,
        Ytest=Ytest,
        priors=['normal', 'normal', 'normal'],
        num_latent=4,
        verbose=verbose,
        burnin=20,
        nsamples=20,
    )

    rmse = smurff.calc_rmse(predictions)
    failure_msg = "Tensor factorization gave RMSE above 0.5 (%f)." % rmse
    self.assertTrue(rmse < 0.5, msg=failure_msg)
def test_bmf_pp(self):
    """Seeded check that a BPMF predict session reports consistent values.

    Verifies that the session's reported RMSE equals the RMSE recomputed
    from its predictions (to 4 places) and that the per-mode posterior
    mean/precision match ``calc_posteriorMeanPrec``.
    """
    np.random.seed(seed)
    observed = scipy.sparse.rand(30, 20, 0.2)
    Ytrain, Ytest = smurff.make_train_test(observed, 0.5, seed=seed)

    trainSession = smurff.BPMFSession(
        Ytrain,
        is_scarce=True,
        Ytest=Ytest,
        num_latent=4,
        verbose=verbose,
        burnin=20,
        nsamples=20,
        save_freq=1,
        seed=seed,
        save_name=smurff.helper.temp_savename(),
    )
    trainSession.run()

    predict_session = trainSession.makePredictSession()
    stats = predict_session.statsYTest()
    sess_rmse = float(stats["rmse_avg"])

    Ypred, Yvar = predict_session.predictionsYTest()

    # Extract (row, col, value) triplets so predictions and test values can
    # be paired by coordinate.
    Yt_i, Yt_j, Yt_v = scipy.sparse.find(Ytest)
    Yp_i, Yp_j, Yp_v = scipy.sparse.find(Ypred)
    assert (Yp_i == Yt_i).all() and (Yp_j == Yt_j).all()

    mse = mean_squared_error(Yt_v, Yp_v)
    calc_rmse = math.sqrt(mse)
    self.assertAlmostEqual(sess_rmse, calc_rmse, 4)

    for mode in range(predict_session.nmodes):
        calc_mu, calc_Lambda = calc_posteriorMeanPrec(predict_session, mode)
        sess_mu, sess_Lambda = predict_session.postMuLambda(mode)
        np.testing.assert_almost_equal(calc_mu, sess_mu)
        np.testing.assert_almost_equal(calc_Lambda, sess_Lambda)
def test_macau_tensor(self):
    """Macau on a 3-mode tensor with side info on mode 0 must reach RMSE < 1.

    The first factor matrix, converted to a sparse COO matrix, is used as
    side information for the first mode; the other modes get none.
    """
    shape = [30, 4, 2]
    A = np.random.randn(shape[0], 2)
    B = np.random.randn(shape[1], 2)
    C = np.random.randn(shape[2], 2)

    # All coordinate triples of the tensor.
    coords = list(
        itertools.product(np.arange(shape[0]),
                          np.arange(shape[1]),
                          np.arange(shape[2])))
    df = pd.DataFrame(np.asarray(coords), columns=["A", "B", "C"])
    df["value"] = np.array(
        [np.sum(A[a, :] * B[b, :] * C[c, :]) for a, b, c in coords])

    Ytrain, Ytest = smurff.make_train_test(df, 0.2, shape=shape)

    Acoo = scipy.sparse.coo_matrix(A)

    predictions = smurff.macau(
        Ytrain=Ytrain,
        Ytest=Ytest,
        side_info=[Acoo, None, None],
        direct=True,
        num_latent=4,
        verbose=verbose,
        burnin=200,
        nsamples=200,
    )

    rmse = smurff.calc_rmse(predictions)
    failure_msg = "Tensor factorization gave RMSE above 1. (%f)." % rmse
    self.assertTrue(rmse < 1., msg=failure_msg)
def test_bpmf_numerictest(self):
    """Smoke test: BPMF runs when the test split is given as a number (0.3)."""
    observed = sp.rand(15, 10, 0.2)
    test_fraction = 0.3
    Xtrain, Xtest = smurff.make_train_test(observed, test_fraction)

    # No assertion: the test passes as long as the call does not raise.
    smurff.bpmf(
        Xtrain,
        Ytest=Xtest,
        num_latent=10,
        burnin=10,
        nsamples=15,
        verbose=verbose,
    )
def test_bpmf(self):
    """BPMF must return exactly one prediction per non-zero test entry."""
    observed = sp.rand(10, 20, 0.2)
    Ytrain, Ytest = smurff.make_train_test(observed, 0.5)

    predictions = smurff.bpmf(
        Ytrain,
        Ytest=Ytest,
        num_latent=4,
        verbose=verbose,
        burnin=50,
        nsamples=50,
    )

    self.assertEqual(Ytest.nnz, len(predictions))
def run_train_session(self, nmodes, sparse):
    """Train on random data with ``nmodes`` modes and return the session.

    Stores the split on ``self.Ytrain`` / ``self.Ytest`` as a side effect.

    Args:
        nmodes: Number of modes; the data has dimensions 2, 3, 4, ...
        sparse: When true, the dense data is first passed through
            ``smurff.make_train_test`` and its second output is used.

    Returns:
        The ``smurff.TrainSession`` after stepping until ``step()`` is falsy.
    """
    dims = range(2, nmodes + 2)  # 2, 3, 4, ...
    Y = np.random.rand(*dims)

    if sparse:
        # make Y SparseTensor through make_train_test
        _, Y = smurff.make_train_test(Y, 0.5)

    self.Ytrain, self.Ytest = smurff.make_train_test(Y, 0.1)

    trainSession = smurff.TrainSession(
        priors=['normal'] * nmodes,
        num_latent=4,
        burnin=10,
        nsamples=15,
        verbose=verbose,
        save_freq=1,
        save_name=smurff.helper.temp_savename(),
    )
    trainSession.addTrainAndTest(self.Ytrain, self.Ytest)
    trainSession.init()

    # Drive the sampler manually until it reports there is nothing left.
    more_steps = True
    while more_steps:
        more_steps = trainSession.step()

    return trainSession
def test_gfa_3view(self):
    """GFA over three identical views returns one prediction per test entry."""
    observed = scipy.sparse.rand(10, 20, 0.2)
    Ytrain, Ytest = smurff.make_train_test(observed, 0.5)

    # The same training matrix is supplied as each of the three views.
    views = [Ytrain, Ytrain, Ytrain]
    predictions = smurff.gfa(
        views,
        Ytest=Ytest,
        num_latent=4,
        verbose=verbose,
        burnin=5,
        nsamples=5,
    )

    self.assertEqual(Ytest.nnz, len(predictions))
def test_bpmf(self):
    """``smurff.smurff`` with normal priors predicts every test entry once."""
    observed = scipy.sparse.rand(10, 20, 0.2)
    Ytrain, Ytest = smurff.make_train_test(observed, 0.5)

    predictions = smurff.smurff(
        Ytrain,
        Ytest=Ytest,
        priors=['normal', 'normal'],
        num_latent=4,
        verbose=False,
        burnin=50,
        nsamples=50,
    )

    self.assertEqual(Ytest.nnz, len(predictions))
def test_bpmf_numerictest(self):
    """Smoke test: ``smurff.smurff`` runs with a numeric (0.3) test split."""
    observed = scipy.sparse.rand(15, 10, 0.2)
    test_fraction = 0.3
    Xtrain, Xtest = smurff.make_train_test(observed, test_fraction)

    # No assertion: the test passes as long as the call does not raise.
    smurff.smurff(
        Xtrain,
        Ytest=Xtest,
        priors=['normal', 'normal'],
        num_latent=10,
        burnin=10,
        nsamples=15,
        verbose=False,
    )
def test_bpmf_dense_matrix_dense_2d_tensor(self):
    """BPMF must give the same predictions for a matrix and a 2D tensor.

    The same fully populated data is fed once as a dense matrix (with a
    sparse test matrix) and once as ``smurff.SparseTensor`` objects; with
    identical seeds the sorted predictions have to agree.
    """
    np.random.seed(1234)

    # Generate train matrix rows, cols and vals
    train_shape = (5, 4)
    sparse_random = sp.random(5, 4, density=1.0)
    train_dense_matrix = sparse_random.todense()
    _, test_sparse_matrix = smurff.make_train_test(sparse_random, 0.2)

    # Create train and test sparse
    train_sparse_matrix = sp.coo_matrix(train_dense_matrix)  # actually dense
    test_sparse_matrix = test_sparse_matrix.tocoo()

    def as_sparse_tensor(coo):
        # Re-express a COO matrix as a coordinate/value table tensor.
        table = pd.DataFrame({
            '0': coo.row,
            '1': coo.col,
            'v': coo.data
        })
        return smurff.SparseTensor(table, train_shape)

    # Create train and test sparse representations of dense tensors
    train_sparse_tensor = as_sparse_tensor(train_sparse_matrix)
    test_sparse_tensor = as_sparse_tensor(test_sparse_matrix)

    # Run SMURFF
    sparse_matrix_predictions = smurff.bpmf(train_dense_matrix,
                                            Ytest=test_sparse_matrix,
                                            num_latent=4,
                                            num_threads=1,
                                            verbose=verbose,
                                            burnin=50,
                                            nsamples=50,
                                            seed=1234)

    sparse_tensor_predictions = smurff.bpmf(train_sparse_tensor,
                                            Ytest=test_sparse_tensor,
                                            num_latent=4,
                                            num_threads=1,
                                            verbose=verbose,
                                            burnin=50,
                                            nsamples=50,
                                            seed=1234)

    # Sort and compare coords and predicted values
    sparse_matrix_predictions.sort()
    sparse_tensor_predictions.sort()

    self.assertEqual(len(sparse_matrix_predictions),
                     len(sparse_tensor_predictions))

    paired = zip(sparse_matrix_predictions, sparse_tensor_predictions)
    for matrix_pred, tensor_pred in paired:
        self.assertEqual(matrix_pred.coords, tensor_pred.coords)
        self.assertAlmostEqual(matrix_pred.pred_1sample,
                               tensor_pred.pred_1sample)
def test_macau_univariate(self):
    """Univariate Macau with side info on both modes predicts every test entry."""
    observed = scipy.sparse.rand(10, 20, 0.2)
    Ytrain, Ytest = smurff.make_train_test(observed, 0.5)

    # Random side information: 2 features per row, 3 features per column.
    row_features = scipy.sparse.coo_matrix(np.random.rand(10, 2))
    col_features = scipy.sparse.coo_matrix(np.random.rand(20, 3))

    predictions = smurff.macau(
        Ytrain,
        Ytest=Ytest,
        side_info=[row_features, col_features],
        univariate=True,
        num_latent=4,
        verbose=verbose,
        burnin=200,
        nsamples=200,
    )

    self.assertEqual(Ytest.nnz, len(predictions))
def test_macau_univariate(self):
    """``smurff.smurff`` with 'macauone' priors predicts every test entry."""
    observed = scipy.sparse.rand(10, 20, 0.2)
    Ytrain, Ytest = smurff.make_train_test(observed, 0.5)

    # Random side information: 2 features per row, 3 features per column.
    row_features = scipy.sparse.coo_matrix(np.random.rand(10, 2))
    col_features = scipy.sparse.coo_matrix(np.random.rand(20, 3))

    predictions = smurff.smurff(
        Ytrain,
        Ytest=Ytest,
        priors=['macauone', 'macauone'],
        side_info=[row_features, col_features],
        num_latent=4,
        verbose=False,
        burnin=50,
        nsamples=50,
    )

    self.assertEqual(Ytest.nnz, len(predictions))
def test_make_train_test(self):
    """A 0.4 split of a full 10x8x3 tensor keeps every entry exactly once.

    Checks the sizes of both parts and that scattering train and test
    values back into a dense array reproduces the original data.
    """
    dims = (10, 8, 3)
    nnz = 10 * 8 * 3

    coords = list(
        itertools.product(np.arange(10), np.arange(8), np.arange(3)))
    df = pd.DataFrame(np.asarray(coords), columns=["A", "B", "C"])
    df["value"] = np.arange(float(nnz))

    Ytr, Yte = smurff.make_train_test(df, 0.4)
    self.assertEqual(Ytr.nnz, nnz * 0.6)
    self.assertEqual(Yte.nnz, nnz * 0.4)

    # Dense reference built straight from the data frame.
    A1 = np.zeros(dims)
    A1[df.A, df.B, df.C] = df.value

    # Dense reconstruction from the two parts of the split.
    A2 = np.zeros(dims)
    for part in (Ytr, Yte):
        A2[part.columns[0], part.columns[1], part.columns[2]] = part.values

    self.assertTrue(np.allclose(A1, A2))
def test_macau(self):
    """``smurff.smurff`` with 'macau' priors predicts every test entry.

    Forty random entries of a dense 10x20 matrix form the observed data;
    one column and one (transposed) row of that matrix serve as dense side
    information for the two modes.
    """
    Ydense = np.random.rand(10, 20)
    r = np.random.permutation(10 * 20)[:40]  # 40 random samples from 10*20 matrix

    side1 = Ydense[:, 1:2]
    side2 = Ydense[1:2, :].transpose()

    Y = scipy.sparse.coo_matrix(Ydense)  # convert to sparse
    Y = scipy.sparse.coo_matrix((Y.data[r], (Y.row[r], Y.col[r])),
                                shape=Y.shape)
    Y, Ytest = smurff.make_train_test(Y, 0.5)

    predictions = smurff.smurff(
        Y,
        Ytest=Ytest,
        priors=['macau', 'macau'],
        side_info=[side1, side2],
        direct=True,
        # side_info_noises=[[('fixed', 1.0, None, None, None)], [('adaptive', None, 0.5, 1.0, None)]],
        num_latent=4,
        verbose=False,
        burnin=50,
        nsamples=50,
    )

    # Previously the result was computed but never checked; assert the same
    # invariant the sibling tests use so the test can actually fail.
    self.assertEqual(Ytest.nnz, len(predictions))
def run_train_session(self):
    """Train on a random sparse 15x10 matrix and return the session.

    Stores the split on ``self.Ytrain`` / ``self.Ytest`` as a side effect.

    Returns:
        The ``smurff.TrainSession`` after stepping until ``step()`` is falsy.
    """
    observed = scipy.sparse.rand(15, 10, 0.2)
    self.Ytrain, self.Ytest = smurff.make_train_test(observed, 0.5)

    # One 'normal' prior per mode of the training data.
    nmodes = len(self.Ytrain.shape)
    session = smurff.TrainSession(
        priors=['normal'] * nmodes,
        num_latent=4,
        burnin=10,
        nsamples=15,
        verbose=verbose,
        save_freq=1,
    )
    session.addTrainAndTest(self.Ytrain, self.Ytest)
    session.init()

    # Drive the sampler manually until it reports there is nothing left.
    more_steps = True
    while more_steps:
        more_steps = session.step()

    return session
def test_macau(self):
    """``smurff.macau`` with dense side info predicts every test entry.

    Forty random entries of a dense 10x20 matrix form the observed data;
    one column and one (transposed) row of that matrix serve as side
    information for the two modes.
    """
    Ydense = np.random.rand(10, 20)
    # 40 random samples from 10*20 matrix
    picked = np.random.permutation(10 * 20)[:40]

    row_side = Ydense[:, 1:2]
    col_side = Ydense[1:2, :].transpose()

    full = scipy.sparse.coo_matrix(Ydense)  # convert to sparse
    sampled = scipy.sparse.coo_matrix(
        (full.data[picked], (full.row[picked], full.col[picked])),
        shape=full.shape)
    Ytrain, Ytest = smurff.make_train_test(sampled, 0.5)

    predictions = smurff.macau(
        Ytrain,
        Ytest=Ytest,
        side_info=[row_side, col_side],
        num_latent=4,
        verbose=verbose,
        burnin=200,
        nsamples=200,
    )

    self.assertEqual(Ytest.nnz, len(predictions))
def run_train_session(self):
    """Run a full training session with side information on mode 0.

    Forty random entries of a normally distributed 10x20 matrix form the
    observed data. Stores the split on ``self.Ytrain`` / ``self.Ytest`` and
    the dense matrix on ``self.side_info`` as side effects.

    Returns:
        The ``smurff.TrainSession`` after ``run()`` has completed.
    """
    dims = (10, 20)
    Ydense = np.random.normal(size=dims).reshape(dims)
    # 40 random samples from 10*20 matrix
    picked = np.random.permutation(10 * 20)[:40]

    full = scipy.sparse.coo_matrix(Ydense)  # convert to sparse
    sampled = scipy.sparse.coo_matrix(
        (full.data[picked], (full.row[picked], full.col[picked])),
        shape=full.shape)

    self.Ytrain, self.Ytest = smurff.make_train_test(sampled, 0.5)
    # The complete dense matrix doubles as side information.
    self.side_info = Ydense

    nmodes = len(self.Ytrain.shape)
    session = smurff.TrainSession(
        priors=['normal'] * nmodes,
        num_latent=32,
        burnin=10,
        nsamples=15,
        verbose=verbose,
        save_freq=1,
    )
    session.addTrainAndTest(self.Ytrain, self.Ytest)
    session.addSideInfo(0, self.side_info)
    session.run()

    return session
def run_train_session(self, nmodes, density):
    """Train on a generated tensor with side information for every mode.

    Stores the split on ``self.Ytrain`` / ``self.Ytest`` as a side effect.

    Args:
        nmodes: Number of modes; the data has dimensions 5, 6, 7, ...
        density: Density passed to ``smurff.generate.gen_tensor``.

    Returns:
        Tuple ``(trainSession, Y, X)``: the stepped-through session, the
        generated data and the generated per-mode side information.
    """
    dims = range(5, nmodes + 5)  # 5, 6, 7, ...
    Y, X = smurff.generate.gen_tensor(dims, 3, density)
    self.Ytrain, self.Ytest = smurff.make_train_test(Y, 0.1)

    # `nsamples` and `verbose` are module-level names.
    trainSession = smurff.TrainSession(
        priors=['normal'] * nmodes,
        num_latent=4,
        burnin=10,
        nsamples=nsamples,
        verbose=verbose,
        save_freq=1,
        save_name=smurff.helper.temp_savename(),
    )
    trainSession.addTrainAndTest(self.Ytrain, self.Ytest)

    # Attach the i-th generated side-info item to mode i.
    for mode, features in enumerate(X):
        trainSession.addSideInfo(mode, features)

    trainSession.init()

    # Drive the sampler manually until it reports there is nothing left.
    more_steps = True
    while more_steps:
        more_steps = trainSession.step()

    return trainSession, Y, X
def train_test(self):
    """Return a ``(train, test)`` split of a random sparse 15x10 matrix."""
    observed = scipy.sparse.rand(15, 10, 0.2)
    Ytrain, Ytest = smurff.make_train_test(observed, 0.5)
    return Ytrain, Ytest
def test_make_train_test(self):
    """Train and test parts must partition the input matrix exactly.

    Their non-zero counts add up to the original count, and subtracting
    both parts from the original leaves an all-zero matrix.
    """
    X = scipy.sparse.rand(15, 10, 0.2)
    Xtr, Xte = smurff.make_train_test(X, 0.5)

    self.assertEqual(X.nnz, Xtr.nnz + Xte.nnz)

    residual = (X - Xtr - Xte).todense()
    diff = np.linalg.norm(residual)
    self.assertEqual(diff, 0.0)
), ] for url, expected_sha, output in urls: if os.path.isfile(output): actual_sha = sha256(open(output, "rb").read()).hexdigest() if (expected_sha == actual_sha): continue print("download %s" % output) urllib.request.urlretrieve(url, output) ic50 = sio.mmread("chembl-IC50-346targets.mm") feat = sio.mmread("chembl-IC50-compound-feat.mm") ic50_100c = ic50.tocsr()[0:100, :] ic50_100c_train, ic50_100c_test = smurff.make_train_test(ic50_100c, 0.2, 1234) # 0,1 binary for probit ic50_01 = ic50.copy() ic50_01.data = (ic50_01.data >= 6) * 1. # -1,+1 ic50_11 = ic50.copy() ic50_11.data = ((ic50.data >= 6) * 2.) - 1. feat_100 = feat.tocsr()[0:100, :] feat_100 = feat_100[:, feat_100.getnnz(0) > 0] feat_100_dense = feat_100.todense() generated_files = [ (
#!/usr/bin/env python
"""Factorize the ChEMBL IC50 matrix with probit noise; print RMSE and AUC."""

import smurff
import matrix_io as mio

# Load the data and hold out part of it for testing.
ic50 = mio.read_matrix("chembl-IC50-346targets.mm")
ic50_train, ic50_test = smurff.make_train_test(ic50, 0.2)
ic50_threshold = 6.

session = smurff.TrainSession(
    priors=['normal', 'normal'],
    num_latent=32,
    burnin=10,
    nsamples=10,
    # Using threshold of 6. to calculate AUC on test data
    threshold=ic50_threshold,
)

# Using activity threshold pIC50 > 6. to binarize train data
probit_noise = smurff.ProbitNoise(ic50_threshold)
session.addTrainAndTest(ic50_train, ic50_test, probit_noise)

predictions = session.run()

print("RMSE = %.2f" % smurff.calc_rmse(predictions))
print("AUC = %.2f" % smurff.calc_auc(predictions, ic50_threshold))