def macau(self, side_info, direct, expected):
    """Train with optional per-mode side information; check RMSE and runtime.

    Args:
        side_info: indexable with two entries; entry ``d`` is either ``None``
            or a key into ``TestExCAPE_py.data`` selecting the side
            information for mode ``d``.
        direct: forwarded as ``direct=`` to ``session.addSideInfo``.
        expected: indexable ``(rmse_upper, rmse_lower, max_seconds)``; the
            average RMSE must lie strictly between the first two values and
            the sampling time must stay below the third.
    """
    args = self.get_default_opts()

    # Every mode that has side information gets the 'macau' prior.
    for d in range(2):
        if side_info[d] is not None:
            args["priors"][d] = 'macau'

    session = smurff.TrainSession(**args)
    Ytrain = TestExCAPE_py.data["train.sdm"]
    Ytest = TestExCAPE_py.data["test.sdm"]
    session.addTrainAndTest(Ytrain, Ytest, self.get_train_noise())

    for d in range(2):
        if side_info[d] is not None:
            session.addSideInfo(d,
                                TestExCAPE_py.data[side_info[d]],
                                self.get_side_noise(),
                                direct=direct)

    session.init()

    # Only the sampling loop (plus the RMSE query) is timed; init() is not.
    start = time()
    while session.step():
        pass
    rmse = session.getRmseAvg()
    stop = time()
    elapsed = stop - start

    self.assertLess(rmse, expected[0])
    self.assertGreater(rmse, expected[1])
    self.assertLess(elapsed, expected[2])
def test_macau_dense_probit(self):
    """Probit factorization of sign(A . B^T) with ``A`` as dense side info on mode 0.

    Builds a fully observed 25 x 3 binary data frame from two random factor
    matrices, splits it with ``smurff.make_train_test_df`` and trains a
    ``['macau', 'normal']`` session with probit noise.
    """
    A = np.random.randn(25, 2)
    B = np.random.randn(3, 2)

    # All (row, col) index pairs of the 25 x 3 grid.
    row_ids = np.arange(A.shape[0])
    col_ids = np.arange(B.shape[0])
    idx = list(itertools.product(row_ids, col_ids))

    df = pd.DataFrame(np.asarray(idx), columns=["A", "B"])
    dot_products = np.array([np.sum(A[r, :] * B[c, :]) for r, c in idx])
    # Label is 1.0 where the latent dot product is positive, else 0.0.
    df["value"] = (dot_products > 0.0).astype(np.float64)

    Ytrain, Ytest = smurff.make_train_test_df(df, 0.2)

    threshold = 0.5  # since we sample from mu(0,1)

    trainSession = smurff.TrainSession(
        priors=['macau', 'normal'],
        num_latent=4,
        threshold=threshold,
        burnin=20,
        nsamples=20,
        verbose=False,
    )
    noise = smurff.ProbitNoise(threshold)
    trainSession.addTrainAndTest(Ytrain, Ytest, noise)
    trainSession.addSideInfo(0, A, direct=True)

    predictions = trainSession.run()
    rmse = smurff.calc_rmse(predictions)

    # NOTE(review): the message talks about AUC, but the checked value comes
    # from calc_rmse -- confirm which metric this test is meant to assert.
    failure_text = (
        "Probit factorization (with dense side) gave AUC below 0.55 (%f)."
        % rmse)
    self.assertTrue(rmse > 0.55, msg=failure_text)
def run_session(self, noise_model):
    """Train a short session with ``noise_model`` and return the test predictions.

    When ``self.side_info()`` yields side information, mode 0 uses the
    'macau' prior. A ``smurff.ProbitNoise`` model is attached to the
    train/test data; any other noise model is attached to the side
    information instead. Without side information the noise model always goes
    to the train/test data.
    """
    Ytrain, Ytest = self.train_test()
    si = self.side_info()
    has_side_info = si is not None

    priors = ['normal'] * len(Ytrain.shape)
    if has_side_info:
        priors[0] = 'macau'

    session = smurff.TrainSession(
        priors=priors,
        num_latent=10,
        burnin=10,
        nsamples=15,
        verbose=verbose,
    )

    if not has_side_info:
        session.addTrainAndTest(Ytrain, Ytest, noise_model)
    else:
        if isinstance(noise_model, smurff.ProbitNoise):
            session.addSideInfo(0, si)
            session.addTrainAndTest(Ytrain, Ytest, noise_model)
        else:
            session.addSideInfo(0, si, noise_model)
            session.addTrainAndTest(Ytrain, Ytest)

    session.init()
    # Drive the sampler until step() reports there is nothing left to do.
    more = session.step()
    while more:
        more = session.step()

    predictions = session.getTestPredictions()
    self.assertEqual(Ytest.nnz, len(predictions))
    self.assertLess(session.getRmseAvg(), 10.)
    return predictions
def train_session(root, train, test, sideinfo=None):
    """Train into a freshly created ``root`` directory and return the RMSE.

    Args:
        root: directory that is removed (errors ignored), recreated and
            passed to the session as ``save_prefix``.
        train: training data handed to ``addTrainAndTest``.
        test: test data handed to ``addTrainAndTest``.
        sideinfo: optional side information for mode 0; skipped when ``None``.

    Returns:
        ``smurff.calc_rmse`` of the predictions returned by ``run()``.
    """
    import shutil

    # Wipe any previous output so the session starts from an empty directory.
    shutil.rmtree(root, ignore_errors=True)
    os.makedirs(root)
    print("save prefix = ", root)

    session_options = dict(
        num_latent=4,
        burnin=800,
        nsamples=100,
        verbose=global_verbose,
        save_freq=1,
        save_prefix=root,
    )
    trainSession = smurff.TrainSession(**session_options)
    trainSession.addTrainAndTest(train, test, smurff.FixedNoise(1.0))

    if sideinfo is not None:
        side_noise = smurff.FixedNoise(10.)
        trainSession.addSideInfo(0, sideinfo, side_noise, direct=True)

    predictions = trainSession.run()
    #print("RMSE = %.2f%s" % (rmse, "" if sideinfo is None else " (with sideinfo)" ))
    return smurff.calc_rmse(predictions)
def test_pybind():
    """Smoke test: run a two-mode session on a 2 x 2 matrix through the bindings.

    The same values are used as dense training data and, converted to CSR,
    as test data. Nothing is asserted; the test passes if ``run()`` returns.
    """
    Y = np.array([[1., 2.],
                  [3., 4.]])
    Y_csr = sp.csr_matrix(Y)

    trainSession = smurff.TrainSession(priors=["normal", "normal"], verbose=2)
    trainSession.setTrain(Y)
    trainSession.setTest(Y_csr)
    results = trainSession.run()
def test_noise_model(density, nmodes, side_info, noise_model):
    """Train with the noise model produced by ``noise_model()`` and sanity-check the result.

    Args:
        density, nmodes, side_info: forwarded to the module-level
            ``train_test`` helper, which supplies train data, test data and
            optional side information.
        noise_model: zero-argument callable returning the noise model used
            for the train/test data.

    Returns:
        The test predictions of the trained session.
    """
    Ytrain, Ytest, si = train_test(density, nmodes, side_info)
    nm = noise_model()
    with_side_info = si is not None

    priors = ['normal'] * nmodes
    if with_side_info:
        priors[0] = 'macau'

    trainSession = smurff.TrainSession(
        priors=priors,
        num_latent=8,
        burnin=20,
        nsamples=20,
        threshold=.0,
        seed=seed,
        verbose=verbose,
    )
    trainSession.addTrainAndTest(Ytrain, Ytest, nm)
    if with_side_info:
        trainSession.addSideInfo(0, si, smurff.SampledNoise(1.), direct=True)

    trainSession.init()
    # Keep sampling until step() returns a falsy value.
    running = trainSession.step()
    while running:
        running = trainSession.step()

    predictions = trainSession.getTestPredictions()
    assert Ytest.nnz == len(predictions)

    if not isinstance(nm, smurff.ProbitNoise):
        assert trainSession.getRmseAvg() < 10.
    else:
        # Probit runs are judged on AUC, which must be within [0, 1].
        assert trainSession.getStatus().auc_avg <= 1.
        assert trainSession.getStatus().auc_avg >= 0.

    return predictions
def run_train_session(self, nmodes, sparse):
    """Train on a random ``nmodes``-dimensional array and return the session.

    The array has dimensions 2, 3, ..., ``nmodes + 1``. The resulting
    train/test split is stored on ``self.Ytrain`` / ``self.Ytest``.

    Args:
        nmodes: number of modes of the generated data.
        sparse: when truthy, the dense array is first passed through
            ``smurff.make_train_test`` and the second returned part is used
            as the data to split.
    """
    dims = range(2, nmodes + 2)  # 2, 3, 4, ...
    Y = np.random.rand(*dims)

    if sparse:
        # make Y SparseTensor through make_train_test
        _, Y = smurff.make_train_test(Y, 0.5)

    self.Ytrain, self.Ytest = smurff.make_train_test(Y, 0.1)

    trainSession = smurff.TrainSession(
        priors=['normal'] * nmodes,
        num_latent=4,
        burnin=10,
        nsamples=15,
        verbose=verbose,
        save_freq=1,
        save_name=smurff.helper.temp_savename(),
    )
    trainSession.addTrainAndTest(self.Ytrain, self.Ytest)
    trainSession.init()

    # Step until the session signals completion.
    stepping = trainSession.step()
    while stepping:
        stepping = trainSession.step()

    return trainSession
def run_train_session(self):
    """Train on a random sparse 15 x 10 matrix and return the session.

    The train/test split is kept on ``self.Ytrain`` / ``self.Ytest`` for use
    by the caller.
    """
    Y = scipy.sparse.rand(15, 10, 0.2)
    self.Ytrain, self.Ytest = smurff.make_train_test(Y, 0.5)

    # One 'normal' prior per mode of the training data.
    priors = ['normal'] * len(self.Ytrain.shape)

    session = smurff.TrainSession(
        priors=priors,
        num_latent=4,
        burnin=10,
        nsamples=15,
        verbose=verbose,
        save_freq=1,
    )
    session.addTrainAndTest(self.Ytrain, self.Ytest)
    session.init()

    unfinished = session.step()
    while unfinished:
        unfinished = session.step()

    return session
def run_train_session(self):
    """Train on 40 entries sampled from a dense 10 x 20 matrix; return the session.

    The dense matrix itself is stored as ``self.side_info`` and attached to
    mode 0. The train/test split is stored on ``self.Ytrain`` /
    ``self.Ytest``.
    """
    n_rows, n_cols = 10, 20
    Ydense = np.random.normal(size=(n_rows, n_cols)).reshape((n_rows, n_cols))

    # 40 random samples from 10*20 matrix
    r = np.random.permutation(n_rows * n_cols)[:40]

    # convert to sparse, then keep only the sampled entries
    full = scipy.sparse.coo_matrix(Ydense)
    sampled = (full.data[r], (full.row[r], full.col[r]))
    Y = scipy.sparse.coo_matrix(sampled, shape=full.shape)

    self.Ytrain, self.Ytest = smurff.make_train_test(Y, 0.5)
    self.side_info = Ydense

    priors = ['normal'] * len(self.Ytrain.shape)

    session = smurff.TrainSession(
        priors=priors,
        num_latent=32,
        burnin=10,
        nsamples=15,
        verbose=verbose,
        save_freq=1,
    )
    session.addTrainAndTest(self.Ytrain, self.Ytest)
    session.addSideInfo(0, self.side_info)
    session.run()
    return session
def macau(self, dirname, expected):
    """Train on the matrices stored in ``dirname``; check RMSE and runtime.

    Args:
        dirname: directory holding ``train.sdm``, ``test.sdm`` and
            ``rows.ddm`` (side information for mode 0).
        expected: indexable ``(rmse_upper, rmse_lower, max_seconds)``.
    """
    def load(name):
        # Read one matrix file from the data directory.
        return mio.read_matrix(join(dirname, name))

    trainSession = smurff.TrainSession(**self.get_default_opts())

    Ytrain = load("train.sdm")
    Ytest = load("test.sdm")
    trainSession.addTrainAndTest(Ytrain, Ytest, self.get_train_noise())

    sideinfo = load("rows.ddm")
    trainSession.addSideInfo(0, sideinfo, self.get_side_noise(), direct=True)

    trainSession.init()

    # Time the sampling loop together with the RMSE query.
    t_begin = time()
    while trainSession.step():
        pass
    rmse = trainSession.getRmseAvg()
    elapsed = time() - t_begin

    rmse_upper, rmse_lower, max_seconds = expected[0], expected[1], expected[2]
    self.assertLess(rmse, rmse_upper)
    self.assertGreater(rmse, rmse_lower)
    self.assertLess(elapsed, max_seconds)
def read_ini(fname):
    """Build a ``smurff.TrainSession`` from the ``.ini`` file ``fname``.

    Session options come from the ``[global]`` section, train and test data
    are loaded through ``read_data``, and a ``side_info_<mode>`` section, if
    present, adds side information for that mode.

    Returns:
        The configured (not yet run) session.
    """
    from configparser import ConfigParser

    cfg = ConfigParser()
    cfg.read(fname)

    priors = read_list(cfg["global"], "prior_")

    # Seed and threshold only apply when their enabling flag is set.
    if cfg.getboolean("global", "random_seed_set"):
        seed = cfg.getint("global", "random_seed")
    else:
        seed = None

    if cfg.getboolean("global", "classify"):
        threshold = cfg.getfloat("global", "threshold")
    else:
        threshold = None

    # Read in the same order as they are passed positionally below.
    num_latent = cfg.getint("global", "num_latent")
    num_threads = cfg.getint("global", "num_threads", fallback=None)
    burnin = cfg.getint("global", "burnin")
    nsamples = cfg.getint("global", "nsamples")
    verbosity = cfg.getint("global", "verbose")
    save_name = cfg.get("global", "save_name",
                        fallback=smurff.temp_savename())
    save_freq = cfg.getint("global", "save_freq", fallback=None)
    checkpoint_freq = cfg.getint("global", "checkpoint_freq", fallback=None)

    session = smurff.TrainSession(
        priors,
        num_latent,
        num_threads,
        burnin,
        nsamples,
        seed,
        threshold,
        verbosity,
        save_name,
        save_freq,
        checkpoint_freq,
    )

    train_data, train_type, train_noise, *_ = read_data(cfg, "train")
    session.setTrain(train_data, train_noise, train_type == "scarce")

    test_data, *_ = read_data(cfg, "test")
    session.setTest(test_data)

    for mode, _prior in enumerate(priors):
        section = "side_info_%d" % mode
        if section not in cfg:
            continue
        # read_data yields six values here; only three are used.
        side_data, _, side_noise, _, direct, _ = read_data(cfg, section)
        session.addSideInfo(mode, side_data, side_noise, direct)

    return session
def run_train_session(self, nmodes, density):
    """Train on a generated tensor with side information on every mode.

    Data comes from ``smurff.generate.gen_tensor`` with dimensions
    5, 6, ..., ``nmodes + 4``. The train/test split is stored on
    ``self.Ytrain`` / ``self.Ytest``.

    Returns:
        Tuple ``(trainSession, Y, X)``: the trained session and both values
        returned by ``gen_tensor``.
    """
    dims = range(5, nmodes + 5)  # 5, 6, 7, ...
    Y, X = smurff.generate.gen_tensor(dims, 3, density)
    self.Ytrain, self.Ytest = smurff.make_train_test(Y, 0.1)

    session_options = dict(
        priors=['normal'] * nmodes,
        num_latent=4,
        burnin=10,
        nsamples=nsamples,
        verbose=verbose,
        save_freq=1,
        save_name=smurff.helper.temp_savename(),
    )
    trainSession = smurff.TrainSession(**session_options)
    trainSession.addTrainAndTest(self.Ytrain, self.Ytest)

    # The i-th element of X is attached as side information of mode i.
    mode = 0
    for x in X:
        trainSession.addSideInfo(mode, x)
        mode += 1

    trainSession.init()
    in_progress = trainSession.step()
    while in_progress:
        in_progress = trainSession.step()

    return trainSession, Y, X
#!/usr/bin/env python
# Example script: factorize a ratings matrix with side information on mode 0
# and save the session to "movielens.hdf5".
import smurff
import pickle

# Load the ratings and split them; 0.2 is the argument given to
# make_train_test.
Y = smurff.matrix_io.read_matrix("ratings_1k_random.sdm")
Ytrain, Ytest = smurff.prepare.make_train_test(Y, 0.2)

# Features used as side information.
sideinfo = smurff.matrix_io.read_matrix("features_1k_random.sdm")

trainSession = smurff.TrainSession(
    num_latent=8,
    burnin=200,
    nsamples=200,
    verbose=1,
    save_name="movielens.hdf5",
    save_freq=1,
)
trainSession.addTrainAndTest(Ytrain, Ytest)
trainSession.addSideInfo(0, sideinfo, smurff.FixedNoise(10.))

trainSession.run()
import numpy as np
from time import time

# Load the train/test activity matrices and the compound features.
ic50_train = mio.read_matrix("chembl-IC50-346targets-100compounds-train.sdm")
ic50_test = mio.read_matrix("chembl-IC50-346targets-100compounds-test.sdm")
# Dense variant of the features, currently disabled:
#feat = mio.read_matrix("chembl-IC50-100compounds-feat-dense.ddm")
feat = mio.read_matrix("chembl-IC50-100compounds-feat.sdm")

ic50_threshold = 6.

trainSession = smurff.TrainSession(
    verbose=1,
    priors=['macau', 'normal'],
    num_latent=32,
    num_threads=1,
    seed=1234,
    burnin=400,
    nsamples=200,
    # Using threshold of 6. to calculate AUC on test data
    threshold=ic50_threshold,
)

## using activity threshold pIC50 > 6. to binarize train data
# NOTE(review): no noise model is passed here, so whether the train data is
# actually binarized depends on the library default -- confirm.
trainSession.addTrainAndTest(ic50_train, ic50_test)
trainSession.addSideInfo(0, feat, noise=smurff.SampledNoise(), direct=True)

# Time the full run and report it together with the RMSE.
start = time()
predictions = trainSession.run()
stop = time()

print("time = %.2f" % (stop - start))
print("RMSE = %.2f" % smurff.calc_rmse(predictions))
import logging

import numpy as np
import scipy.sparse as sp
import smurff

# Minimal example: the same 2 x 2 values serve as dense training data and,
# converted to CSR, as test data.

# logging.getLogger().setLevel(logging.INFO)

trainSession = smurff.TrainSession(priors=["normal", "normal"])

Y = np.array([[1., 2.],
              [3., 4.]])
trainSession.setTrain(Y)
trainSession.setTest(sp.csr_matrix(Y))

results = trainSession.run()
# for r in results:
#     print(r)
def main():
    """Command-line entry point for the SMURFF Python module.

    Parses the command line, builds a ``smurff.TrainSession`` (from an
    ``.ini`` file when ``--ini`` is given, otherwise with defaults), loads
    the data files named on the command line, applies the remaining options
    through the session's setters and finally either runs the session
    (``run``) or only initialises it (``save``).

    Raises:
        SystemExit: after printing the version when ``--version`` is given,
            and from ``argparse`` on invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description=
        'pySMURFF - command line utility to the SMURFF Python module')
    # NOTE(review): `command` is a required positional, so `--version` on its
    # own is still rejected by argparse -- confirm whether that is intended.
    parser.add_argument("command",
                        help="Do full 'run' or only 'save' to .h5",
                        choices=['run', 'save'])

    group = parser.add_argument_group("General parameters")
    group.add_argument("--version",
                       action="store_true",
                       help="print version info (and exit)")
    group.add_argument("--verbose",
                       metavar="NUM",
                       type=int,
                       default=1,
                       help="verbose output (default = 1)")
    group.add_argument("--ini",
                       metavar="FILE",
                       type=str,
                       help="read options from this .ini file")
    group.add_argument("--num-threads",
                       metavar="NUM",
                       type=int,
                       help="number of threads (0 = default by OpenMP)")
    group.add_argument("--seed",
                       metavar="NUM",
                       type=int,
                       help="random number generator seed")

    group = parser.add_argument_group("Used during training")
    group.add_argument("--train",
                       metavar="FILE",
                       type=str,
                       help="train data file")
    group.add_argument("--test",
                       metavar="FILE",
                       type=str,
                       help="test data")
    group.add_argument("--row-features",
                       metavar="FILE",
                       type=str,
                       help="sparse/dense row features")
    group.add_argument("--col-features",
                       metavar="FILE",
                       type=str,
                       help="sparse/dense column features")
    group.add_argument(
        "--prior",
        metavar="NAME",
        nargs=2,
        type=str,
        help=
        "provide a prior-type for each dimension of train; prior-types: <normal|normalone|spikeandslab|macau|macauone>"
    )
    group.add_argument("--burnin",
                       metavar="NUM",
                       type=int,
                       help="number of samples to discard")
    group.add_argument("--nsamples",
                       metavar="NUM",
                       type=int,
                       help="number of samples to collect")
    group.add_argument("--num-latent",
                       metavar="NUM",
                       type=int,
                       help="number of latent dimensions")
    group.add_argument(
        "--threshold",
        metavar="NUM",
        type=float,
        help="threshold for binary classification and AUC calculation")

    group = parser.add_argument_group("Storing models and predictions")
    group.add_argument("--restore-from",
                       metavar="FILE",
                       type=str,
                       help="restore trainSession from a saved .h5 file")
    group.add_argument("--save-name",
                       metavar="FILE",
                       type=str,
                       help="save model and/or predictions to this .h5 file")
    group.add_argument(
        "--save-freq",
        metavar="NUM",
        type=int,
        help="save every n iterations (0 == never, -1 == final model)")
    group.add_argument(
        "--checkpoint-freq",
        metavar="NUM",
        type=int,
        help="save state every n seconds, only one checkpointing state is kept"
    )

    args = parser.parse_args()
    print(args)

    if args.version:
        print("SMURFF %s" % smurff.version)
        # The previous bare `exit` was only a name lookup and never stopped
        # the program; terminate explicitly instead.
        raise SystemExit(0)

    session = smurff.TrainSession()
    if args.ini is not None:
        session = read_ini(args.ini)

    # argparse stores "--some-option" under the attribute "some_option".
    options = vars(args)

    # Options naming a matrix file: read the file, hand the data to the session.
    file_options = {
        "train": session.setTrain,
        "test": session.setTest,
        "row_features": lambda x: session.addSideInfo(0, x),
        "col_features": lambda x: session.addSideInfo(1, x),
    }

    for opt, func in file_options.items():
        fname = options.get(opt)
        if fname is not None:
            data = mio.read_matrix(fname)
            func(data)

    # Plain-valued options, applied on top of whatever the .ini file set.
    other_options = {
        "verbose": session.setVerbose,
        "num_threads": session.setNumThreads,
        "seed": session.setRandomSeed,
        "prior": session.setPriorTypes,
        "burnin": session.setBurnin,
        "nsamples": session.setNSamples,
        "num_latent": session.setNumLatent,
        "threshold": session.setThreshold,
        "restore_from": session.setRestoreName,
        "save_name": session.setSaveName,
        "save_freq": session.setSaveFreq,
        # Was "checkpoint-freq", which never matched the argparse attribute
        # name, so --checkpoint-freq was silently ignored.
        "checkpoint_freq": session.setCheckpointFreq,
    }

    print(options)

    for opt, func in other_options.items():
        value = options.get(opt)
        if value is not None:
            print("processing opt:", opt, "with value", value)
            func(value)

    if args.command == "run":
        session.run()
    else:
        session.init()  # init will validate and save
#!/usr/bin/env python
# Example script: probit matrix factorization of IC50 activities, reporting
# RMSE and AUC on the held-out part.
import smurff
import matrix_io as mio

# Load the data and split it; 0.2 is the argument given to make_train_test.
ic50 = mio.read_matrix("chembl-IC50-346targets.mm")
ic50_train, ic50_test = smurff.make_train_test(ic50, 0.2)

ic50_threshold = 6.

session = smurff.TrainSession(
    priors=['normal', 'normal'],
    num_latent=32,
    burnin=10,
    nsamples=10,
    # Using threshold of 6. to calculate AUC on test data
    threshold=ic50_threshold,
)

## using activity threshold pIC50 > 6. to binarize train data
probit_noise = smurff.ProbitNoise(ic50_threshold)
session.addTrainAndTest(ic50_train, ic50_test, probit_noise)

predictions = session.run()

print("RMSE = %.2f" % smurff.calc_rmse(predictions))
print("AUC = %.2f" % smurff.calc_auc(predictions, ic50_threshold))