def useHier(topX, regX, R, hierIters, hierInner, regIters, regInner, tensorInfo):
    topY1, top1stats, top1mstats = CP_APR.cp_apr(topX, R, maxiters=hierIters, maxinner=hierInner)
    # reduce the factors to probabilities and then sort them
    topY1.normalize_sort(1)
    topY1 = pmdTools.zeroSmallFactors(topY1, 1e-4)
    ### Use the top-level factors to populate the fine-grained factors
    Udiag = np.zeros((len(tensorInfo['diag']), R))
    Umed = np.zeros((len(tensorInfo['med']), R))
    ### Patient factors stay the same
    for idx, diag in enumerate(tensorInfo['diag']):
        topDiagIdx = tensorInfo['diagHier'][diag]
        diagCount = tensorInfo['diagHierCount'][topDiagIdx]
        Udiag[idx, :] = topY1.U[1][topDiagIdx, :] / diagCount
    for idx, med in enumerate(tensorInfo['med']):
        topMedIdx = tensorInfo['medHier'][med]
        medCount = tensorInfo['medHierCount'][topMedIdx]
        Umed[idx, :] = topY1.U[2][topMedIdx, :] / medCount
    Mtop = ktensor.ktensor(np.ones(R), [topY1.U[0].copy(), Udiag, Umed])
    # warm-start the regular decomposition with the hierarchically derived factors
    Y1, ystats, mstats = CP_APR.cp_apr(regX, R, Minit=Mtop, maxiters=regIters, maxinner=regInner)
    return Y1, topY1, top1stats, top1mstats, ystats, mstats
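# A minimal sketch of the tensorInfo layout that useHier expects, inferred
# from the lookups above; the concrete labels are hypothetical. 'diag' and
# 'med' list the fine-grained axis labels, 'diagHier'/'medHier' map each
# label to its top-level row index, and 'diagHierCount'/'medHierCount' give
# the number of fine-grained items under each top-level index.
tensorInfo = {
    'diag': ['250.00', '250.02', '401.9'],
    'diagHier': {'250.00': 0, '250.02': 0, '401.9': 1},
    'diagHierCount': {0: 2, 1: 1},
    'med': ['metformin', 'lisinopril'],
    'medHier': {'metformin': 0, 'lisinopril': 1},
    'medHierCount': {0: 1, 1: 1},
}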
def evaluatePredictionAUC_1(self, experCount):
    run = 0
    sumBaseAUC = 0.0
    sumCprAUC = 0.0
    MCPR, cpstats, mstats = CP_APR.cp_apr(self.X, self.R, maxiters=1, maxinner=7)
    # MCPR.normalize_sort(1)
    MCPR.redistribute(0)
    ## scale by summing across the rows
    totWeight = np.sum(MCPR.U[0], axis=1)
    zeroIdx = np.where(totWeight < 1e-100)[0]
    if len(zeroIdx) > 0:
        # for the all-zero rows, distribute the weight evenly
        evenDist = np.repeat(1.0 / self.R, len(zeroIdx) * self.R)
        MCPR.U[0][zeroIdx, :] = evenDist.reshape((len(zeroIdx), self.R))
        totWeight = np.sum(MCPR.U[0], axis=1)
    twMat = np.repeat(totWeight, self.R).reshape(self.X.shape[0], self.R)
    MCPR.U[0] = MCPR.U[0] / twMat
    rawXfile = self.data_dir + 'experiment_runprecess/rawdataX_' + str(experCount) + '.csv'
    rawYfile = self.data_dir + 'experiment_runprecess/rawdataY_' + str(experCount) + '.csv'
    cprXfile = self.data_dir + 'experiment_runprecess/cprdataX_' + str(experCount) + '.csv'
    cprYfile = self.data_dir + 'experiment_runprecess/cprdataY_' + str(experCount) + '.csv'
    np.savetxt(rawXfile, self.rawFeatures)
    np.savetxt(rawYfile, self.Y)
    np.savetxt(cprXfile, MCPR.U[0])
    np.savetxt(cprYfile, self.Y)
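# A hedged follow-up sketch: evaluatePredictionAUC_1 only dumps the raw and
# CP-APR feature matrices to CSV, so the actual AUC comparison presumably
# happens downstream. Something along these lines would close the loop;
# logistic regression is an assumption, not necessarily the model used.
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

rawX = np.loadtxt(rawXfile)   # filenames as written by the method above
cprX = np.loadtxt(cprXfile)
y = np.loadtxt(rawYfile)
for name, feat in [("raw", rawX), ("cp-apr", cprX)]:
    pred = LogisticRegression().fit(feat, y).predict_proba(feat)[:, 1]
    print("{0} AUC: {1}".format(name, metrics.roc_auc_score(y, pred)))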
def factorTensor(X):
    # set the seed for the same initialization every run;
    # seed, R, tol, outerIters, innerIters, and zeroThr are module-level settings
    np.random.seed(seed)
    Y, iterStats, mstats = CP_APR.cp_apr(X, R, tol=tol, maxiters=outerIters, maxinner=innerIters)
    Y.normalize_sort(1)
    Y = decompTools.zeroSmallFactors(Y, zeroThr=zeroThr)
    return Y
def findFactors(self, trainX, zeroThr=1e-4):
    """ Find the factor basis for this tensor """
    M, cpstats, mstats = CP_APR.cp_apr(trainX, R=self.R, maxiters=self.outerIter, maxinner=self.innerIter)
    M.normalize_sort(1)
    # zero out the small factor entries
    for n in range(M.ndims()):
        zeroIdx = np.where(M.U[n] < zeroThr)
        M.U[n][zeroIdx] = 0
    return KLProjection.KLProjection(M.U, self.R)
def findFactors(X, R=100, outerIter=70, innerIter=10, zeroThr=1e-4):
    """ Find the factor basis for this tensor """
    M, cpstats, mstats = CP_APR.cp_apr(X, R=R, maxiters=outerIter, maxinner=innerIter)
    M.normalize_sort(1)
    M = decompTools.zeroSmallFactors(M, zeroThr)
    return KLProjection.KLProjection(M.U, R), M, mstats
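# A minimal usage sketch, assuming X is an sptensor built as in the test
# files below: factor the tensor, then project it onto the learned basis to
# obtain per-patient features (projectSlice usage mirrors the KLProjection
# test further down).
klp, M, mstats = findFactors(X, R=50, outerIter=70, innerIter=10)
patientFeat = klp.projectSlice(X, 0)  # patient-mode membership matrix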
def decomposeCountTensor(filename, R, outerIters=20, innerIters=10, convergeTol=1e-2, zeroTol=1e-4):
    """
    From a file, load the tensor data and then decompose using CP_APR
    with the specified rank.

    Parameters:
    filename - the file that stores the sparse tensor representation using numpy
    R - the rank of the tensor
    outerIters - the maximum number of outer iterations
    innerIters - the maximum number of inner iterations
    convergeTol - the convergence tolerance
    zeroTol - the threshold below which factor entries are zeroed out

    Output:
    Y - the normalized, sorted, and thresholded ktensor
    iterStats - the per-iteration statistics
    modelStats - the model statistics
    """
    X = sptensor.loadTensor(filename)
    Y, iterStats, modelStats = CP_APR.cp_apr(X, R, tol=convergeTol, maxiters=outerIters, maxinner=innerIters)
    # normalize the factors using the 1-norm and then sort in descending order
    Y.normalize_sort(1)
    Y = zeroSmallFactors(Y, zeroThr=zeroTol)
    return Y, iterStats, modelStats
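# A short usage sketch; the filename is hypothetical and just needs to point
# at a sparse tensor saved in the numpy-based format read by sptensor.loadTensor.
Y, iterStats, modelStats = decomposeCountTensor("data/patient-tensor.dat", R=50)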
compOut = []
for train, test in ttss:
    if n != nSample:
        n = n + 1
        continue
    else:
        trainShape = list(X.shape)
        trainShape[0] = len(train)
        trainX = predictionModel.tensorSubset(X, train, trainShape)
        ## Do the tensor factorization
        np.random.seed(seed)
        startTime = time.time()
        M, cpstats, mstats = CP_APR.cp_apr(trainX, R, maxiters=outerIter, maxinner=10)
        M.normalize_sort(1)
        # zero out the small factor entries (mode 1 only, per the original loop
        # bounds); use a separate loop variable so the fold counter n is not clobbered
        for mode in range(1, 2):
            zeroIdx = np.where(M.U[mode] < zeroThr)
            M.U[mode][zeroIdx] = 0
        elapsed = time.time() - startTime
        compOut.append({"expt": exptID, "R": R, "Outer": outerIter,
                        "Model": "Limestone", "Comp": elapsed})
## connection to mongo-db
client = MongoClient()
db = client.gravel
exptDB = db.factor
## verify the experiment ID is not already taken
if exptDB.find({"id": exptID}).count():
    print("Experiment ID already exists, select another")
    return
print("Starting Tensor Factorization with ID:{0}".format(exptID))
np.random.seed(seed)
## factorize using CP_APR (this is the original)
Y, iterStats, modelStats = CP_APR.cp_apr(X, R, tol=tol, maxiters=outerIters, maxinner=innerIters)
## Y.writeRawFile("results/apr-raw-{0}.dat".format(exptID))
Youtfile = "results/apr-db-{0}-{1}.csv".format(exptID, outerIters)
Ysqlfile = "results/apr-sql-{0}.sql".format(exptID)
# save the decomposition in the database output format
Yout = decompTools.getDBOutput(Y, yaxis)
Yout = np.column_stack((np.repeat(exptID, Yout.shape[0]), Yout))
np.savetxt(Youtfile, Yout, fmt="%s", delimiter="|")
sqlOut = open(Ysqlfile, "w")
sqlOut.write("load data local infile '/home/joyce/workspace/Health/analysis/tensor/{0}' into table tensor_factors fields terminated by '|' ;\n".format(Youtfile))
sqlOut.write("insert into tensor_models(expt_ID, label_ID, description, rank, iterations, inner_iterations, seed, least_squares, log_likelihood, kkt_violation) values({0}, {1}, '{2}', {3}, {4}, {5}, {6}, {7}, {8}, {9});\n".format(exptID, labelID, exptDesc, R, outerIters, innerIters, seed, modelStats['LS'], modelStats['LL'], modelStats['KKT']))
    # tail of calculateValues(TM, M), which returns the FMS, FOS, and NNZ metrics
    return fms, fos, nnz

for sample in range(10):
    seed = sample * 1000
    np.random.seed(seed)
    ## solve for the Marble solution
    startTime = time.time()
    spntf = SP_NTF.SP_NTF(X, R=R, alpha=alpha, maxinner=INNER_ITER, maxiters=MAX_ITER)
    Yinfo = spntf.computeDecomp(gamma=gamma)
    ## calculate all the requested entries
    marbleElapse = time.time() - startTime
    marbleFMS, marbleFOS, marbleNNZ = calculateValues(TM, spntf.M[SP_NTF.REG_LOCATION])
    np.random.seed(seed)
    startTime = time.time()
    YCP, ycpstats, mstats = CP_APR.cp_apr(X, R=R, maxinner=INNER_ITER, maxiters=MAX_ITER)
    cpaprElapse = time.time() - startTime
    cpaprFMS, cpaprFOS, cpaprNNZ = calculateValues(TM, YCP)
    # Limestone = CP-APR followed by hard thresholding, so it reuses the CP-APR timing
    for n in range(YCP.ndims()):
        YCP.U[n] = tensorTools.hardThresholdMatrix(YCP.U[n], gamma[n])
    limestoneFMS, limestoneFOS, limestoneNNZ = calculateValues(TM, YCP)
    sampleResult = {
        "Order": ["Marble", "CPAPR", "Limestone"],
        "FMS": [marbleFMS, cpaprFMS, limestoneFMS],
        "FOS": [marbleFOS, cpaprFOS, limestoneFOS],
        "CompTime": [marbleElapse, cpaprElapse, cpaprElapse],
        "NNZ": [marbleNNZ, cpaprNNZ, limestoneNNZ]
    }
    data[str(sample)] = sampleResult
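# A hedged sketch for persisting the seed sweep; the output path is
# hypothetical and JSON is an assumption about the serialization format.
import json
with open("results/sample-results.json", "w") as outfile:
    json.dump(data, outfile)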
import numpy as np

import CP_APR
import ktensor
import sptensor
import tensor

""" Test file associated with the CP decomposition using APR """

""" Test factorization of a sparse tensor """
subs = np.array([[0, 3, 1], [1, 0, 1], [1, 2, 1], [1, 3, 1], [3, 0, 0]])
vals = np.array([[1], [1], [1], [1], [3]])
siz = np.array([5, 5, 2])  # 5x5x2 tensor
X = sptensor.sptensor(subs, vals, siz)
U0 = np.array([[0.7689, 0.8843, 0.7487, 0.0900],
               [0.1673, 0.5880, 0.8256, 0.1117],
               [0.8620, 0.1548, 0.7900, 0.1363],
               [0.9899, 0.1999, 0.3185, 0.6787],
               [0.5144, 0.4070, 0.5341, 0.4952]])
U1 = np.array([[0.1897, 0.5606, 0.8790, 0.9900],
               [0.4950, 0.9296, 0.9889, 0.5277],
               [0.1476, 0.6967, 0.0006, 0.4795],
               [0.0550, 0.5828, 0.8654, 0.8013],
               [0.8507, 0.8154, 0.6126, 0.2278]])
U2 = np.array([[0.4981, 0.5747, 0.7386, 0.2467],
               [0.9009, 0.8452, 0.5860, 0.6664]])
Minit = ktensor.ktensor(np.ones(4), [U0, U1, U2])
fms = Minit.fms(Minit)
Y, cpstats, modelStats = CP_APR.cp_apr(X, 4, Minit=Minit, maxiters=100)
Y.normalize_sort(1)

""" Test factorization of a dense tensor """
X = tensor.tensor(range(1, 25), [3, 4, 2])
print(CP_APR.cp_apr(X, 4))
import numpy as np

import CP_APR
import ktensor
import KLProjection
import sptensor

""" Test file associated with the CP decomposition using APR """

""" Test factorization of a sparse tensor """
subs = np.array([[0, 3, 1], [1, 0, 1], [1, 2, 1], [1, 3, 1], [3, 0, 0]])
vals = np.array([[1], [1], [1], [1], [3]])
siz = np.array([5, 5, 2])  # 5x5x2 tensor
X = sptensor.sptensor(subs, vals, siz)
U0 = np.array([[0.7689, 0.8843, 0.7487, 0.0900],
               [0.1673, 0.5880, 0.8256, 0.1117],
               [0.8620, 0.1548, 0.7900, 0.1363],
               [0.9899, 0.1999, 0.3185, 0.6787],
               [0.5144, 0.4070, 0.5341, 0.4952]])
U1 = np.array([[0.1897, 0.5606, 0.8790, 0.9900],
               [0.4950, 0.9296, 0.9889, 0.5277],
               [0.1476, 0.6967, 0.0006, 0.4795],
               [0.0550, 0.5828, 0.8654, 0.8013],
               [0.8507, 0.8154, 0.6126, 0.2278]])
U2 = np.array([[0.4981, 0.5747, 0.7386, 0.2467],
               [0.9009, 0.8452, 0.5860, 0.6664]])
Minit = ktensor.ktensor(np.ones(4), [U0, U1, U2])
fms = Minit.fms(Minit)
Y, cpstats, modelStats = CP_APR.cp_apr(X, 4, Minit=Minit, maxiters=100)
Y.normalize_sort(1)

# Test projection of a new tensor onto the learned basis
subs2 = np.array([[0, 3, 1], [1, 2, 0]])
vals2 = np.array([[1], [1]])
siz2 = np.array([2, 5, 2])
Xhat = sptensor.sptensor(subs2, vals2, siz2)
klproj = KLProjection.KLProjection(Y.U, 4)
np.random.seed(10)
klproj.projectSlice(Xhat, 0)
    # tail of a helper that maps a flattened diagnosis-medication index back
    # to its axis labels (470 medications per diagnosis row)
    diagIdx = idx // 470
    medIdx = idx % 470
    return axisList[1][diagIdx] + axisList[2][medIdx]

for train, test in ttss:
    if n != nSample:
        n = n + 1
        continue
    else:
        trainShape = list(X.shape)
        trainShape[0] = len(train)
        trainX = predictionModel.tensorSubset(X, train, trainShape)
        ## Do the tensor factorization
        np.random.seed(seed)
        M, cpstats, mstats = CP_APR.cp_apr(trainX, R, maxiters=outerIter, maxinner=10)
        M.normalize_sort(1)
        M.writeRawFile(factorFile)
        Yout = decompTools.getDBOutput(M, yaxis)
        Yout = np.column_stack((np.repeat(exptID, Yout.shape[0]), Yout))
        np.savetxt(Youtfile, Yout, fmt="%s", delimiter="|")
        sqlOut = open(Ysqlfile, "w")
        sqlOut.write("load data local infile '/home/joyce/workspace/Health/analysis/tensor/{0}' into table tensor_factors fields terminated by '|' ;\n".format(Youtfile))
        sqlOut.write("insert into tensor_models(expt_ID, label_ID, description, rank, iterations, inner_iterations, seed, least_squares, log_likelihood, kkt_violation) values({0}, {1}, '{2}', {3}, {4}, {5}, {6}, {7}, {8}, {9});\n".format(exptID, labelID, exptDesc, R, outerIter, innerIter, seed, mstats['LS'], mstats['LL'], mstats['KKT']))
        klp = KLProjection.KLProjection(M.U, M.R)
        ptfFeat = klp.projectSlice(X, 0)
        trainY = Y[train]
        predModel.fit(ptfFeat[train, :], trainY)
        ptfPred = predModel.predict_proba(ptfFeat[test, :])
        fpr, tpr, thresholds = metrics.roc_curve(Y[test], ptfPred[:, 1], pos_label=1)
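# A small follow-up sketch: collapse the ROC curve into the AUC used for
# comparison; metrics.auc is standard sklearn, and the variable name is
# hypothetical.
ptfAUC = metrics.auc(fpr, tpr)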
## calculate the diagnosis-medication combinations
diagMed = [[a, b] for a, b in itertools.product(yaxis[1], yaxis[2])]

def getDBEntry(featureName, m):
    output = np.zeros((1, 4))
    for r in range(R):
        # get the nonzero indices
        idx = np.flatnonzero(m[:, r])
        tmp = np.column_stack((np.array(diagMed)[idx], np.repeat(r, len(idx)), m[idx, r]))
        output = np.vstack((output, tmp))
    output = np.delete(output, (0), axis=0)
    output = np.column_stack((np.repeat(exptID, output.shape[0]),
                              np.repeat(featureName, output.shape[0]),
                              output))
    return output

np.random.seed(seed)
M, cpstats, mstats = CP_APR.cp_apr(X, R, maxiters=iters, maxinner=innerIter)
M.normalize_sort(1)
## Threshold the factor values (mode 1 only, per the original loop bounds)
for n in range(1, 2):
    zeroIdx = np.where(M.U[n] < modeThr)
    M.U[n][zeroIdx] = 0
## Get the diagnosis-medication matrix
ptfMatrix = khatrirao.khatrirao(M.U[1], M.U[2])
dbOutput = getDBEntry("CP-APR", ptfMatrix)
flatX = sptenmat.sptenmat(X, [0]).tocsrmat()  # matricize along the first mode
nmfModel = nimfa.mf(flatX, method="nmf", max_iter=iters, rank=R)
nmfResult = nimfa.mf_run(nmfModel)
nmfBasis = nmfResult.coef().transpose()
nmfBasis = preprocessing.normalize(nmfBasis, norm="l1", axis=0)
nmfBasis = nmfBasis.toarray()
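# A hedged continuation: the NMF basis is presumably stacked into the same
# database output as the CP-APR entries, since nmfBasis shares the
# diagnosis-medication row layout of ptfMatrix.
dbOutput = np.vstack((dbOutput, getDBEntry("NMF", nmfBasis)))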
flatX = sptenmat.sptenmat(xprime, [0]).tocsrmat()  # matricize along the first mode
stats = np.zeros((1, 6))
## NMF timing
for k in range(samples):
    startTime = time.time()
    nmfModel = nimfa.mf(flatX, method="nmf", max_iter=iters, rank=R)
    nmfResult = nimfa.mf_run(nmfModel)
    elapsed = time.time() - startTime
    stats = np.vstack((stats, np.array([R, iters, pn, k, "NMF", elapsed])))
## PCA timing (pcaModel is defined elsewhere)
for k in range(samples):
    startTime = time.time()
    pcaModel.fit(flatX)
    elapsed = time.time() - startTime
    stats = np.vstack((stats, np.array([R, iters, pn, k, "PCA", elapsed])))
## Tensor factorization timing
for k in range(samples):
    startTime = time.time()
    CP_APR.cp_apr(xprime, R, maxiters=iters)
    elapsed = time.time() - startTime
    stats = np.vstack((stats, np.array([R, iters, pn, k, "CP_APR", elapsed])))
stats = np.delete(stats, (0), axis=0)
outFile = "results/patient-cpu-{0}.csv".format(pn)
np.savetxt(outFile, stats, fmt="%s", delimiter="|")
print("load data local infile '/home/joyce/workspace/Health/analysis/tensor/{0}' into table comp_metrics fields terminated by '|' ;\n".format(outFile))