def evaluatePredictionAUC(self):
    """Cross-validated AUC comparison: raw features vs. PTF-projected features.

    For each train/test split in ``self.ttss``, fits the tensor factors on the
    training subtensor only (``self.findFactors``), projects every slice of the
    full tensor onto those factors, and scores ``self.predModel`` on both the
    raw feature matrix and the projected (compressed) features.

    Returns:
        tuple: ``(meanBaseAUC, meanCprAUC)`` — AUC averaged over all splits for
        the baseline features and the factor-projected features respectively.
    """
    run = 0
    sumBaseAUC = 0.0
    sumCprAUC = 0.0
    for train, test in self.ttss:
        print("Evaluating Run:{0}".format(run))
        # Build the training subtensor: identical shape to X except mode 0,
        # which is restricted to the training indices.
        trainShape = list(self.X.shape)
        trainShape[0] = len(train)
        trainX = tensorSubset(self.X, train, trainShape)
        ## find the tensor factors for PTF-HT on the training data only
        klp = self.findFactors(trainX)
        ## project the full tensor along mode 0 to get the reduced features
        ## (test rows are projected onto factors learned from training rows)
        ptfFeat = klp.projectSlice(self.X, 0)
        ## evaluate the same predictive model on both feature sets
        baseAUC, basePred = predictionTools.getAUC(self.predModel, self.rawFeatures,
                                                   self.Y, train, test)
        cprAUC, cprPred = predictionTools.getAUC(self.predModel, ptfFeat,
                                                 self.Y, train, test)
        sumBaseAUC += baseAUC
        sumCprAUC += cprAUC
        run = run + 1
    # Average over the number of completed runs and report.
    meanBaseAUC = sumBaseAUC / run
    meanCprAUC = sumCprAUC / run
    print(meanBaseAUC)
    print('**************************************')
    print(meanCprAUC)
    return meanBaseAUC, meanCprAUC
def evaluatePredictionAUC_2(self, experCount, Demog):
    """Cross-validated AUC: raw features vs. demographic-regularized CP-APR factors.

    Fits the demographic-coupled CP-APR decomposition once on the full tensor,
    row-normalizes the mode-0 factor matrix so each row's loadings sum to one,
    dumps the raw and factor feature matrices to CSV for offline use, then
    evaluates ``self.predModel`` on every train/test split in ``self.ttss``.

    Args:
        experCount: experiment index used to name the CSV dump files.
        Demog: demographic side-information matrix, one row per mode-0 entity
            (presumably patients — confirm against caller).

    Returns:
        tuple: ``(meanBaseAUC, meanCprAUC)`` averaged over all splits.
    """
    run = 0
    sumBaseAUC = 0.0
    sumCprAUC = 0.0
    # Regularization weights for the demographic coupling terms.
    lambda1 = 1
    lambda4 = 1
    # Random initialization for the demographic factor matrix (R x #demog-cols).
    DemoU = np.random.rand(self.R, Demog.shape[1])
    MCPR, cpstats, mstats = cp_apr_demog.cp_apr(self.X, self.R, Demog, DemoU,
                                                lambda1, lambda4,
                                                maxiters=40,
                                                maxinner=self.innerIter)
    MCPR.normalize_sort(1)
    ## scale each mode-0 row to sum to 1 (row-stochastic memberships)
    totWeight = np.sum(MCPR.U[0], axis=1)
    zeroIdx = np.where(totWeight < 1e-100)[0]
    if len(zeroIdx) > 0:
        # for the all-zero rows, distribute weight evenly over the R factors
        evenDist = np.repeat(1.0 / self.R, len(zeroIdx) * self.R)
        MCPR.U[0][zeroIdx, :] = evenDist.reshape((len(zeroIdx), self.R))
        totWeight = np.sum(MCPR.U[0], axis=1)
    twMat = np.repeat(totWeight, self.R).reshape(self.X.shape[0], self.R)
    MCPR.U[0] = MCPR.U[0] / twMat
    # Persist both feature representations so the experiment can be re-run offline.
    rawXfile = self.data_dir + 'experimentDemo/rawdataX_' + str(experCount) + '.csv'
    rawYfile = self.data_dir + 'experimentDemo/rawdataY_' + str(experCount) + '.csv'
    cprXfile = self.data_dir + 'experimentDemo/cprdataX_' + str(experCount) + '.csv'
    cprYfile = self.data_dir + 'experimentDemo/cprdataY_' + str(experCount) + '.csv'
    np.savetxt(rawXfile, self.rawFeatures)
    np.savetxt(rawYfile, self.Y)
    np.savetxt(cprXfile, MCPR.U[0])
    np.savetxt(cprYfile, self.Y)
    for train, test in self.ttss:
        print("Evaluating Run:{0}".format(run))
        # NOTE: unlike evaluatePredictionAUC, the factors here were fit on the
        # full tensor before the loop, so no per-split subtensor is needed.
        ## evaluate the same predictive model on both feature sets
        baseAUC, basePred = predictionTools.getAUC(self.predModel, self.rawFeatures,
                                                   self.Y, train, test)
        cprAUC, cprPred = predictionTools.getAUC(self.predModel, MCPR.U[0],
                                                 self.Y, train, test)
        sumBaseAUC += baseAUC
        sumCprAUC += cprAUC
        print('base:' + str(baseAUC))
        print('apr:' + str(cprAUC))
        run = run + 1
    print('**************************************')
    meanBaseAUC = sumBaseAUC / run
    meanCprAUC = sumCprAUC / run
    print(meanBaseAUC)
    print(meanCprAUC)
    return meanBaseAUC, meanCprAUC
"""Baseline predictive-model experiment.

Fits L1-regularized logistic regression on the flattened (mode-0 matricized)
tensor and records the AUC for each of ten stratified train/test splits.
"""
import json
import numpy as np
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
import sys
sys.path.append("..")
import sptenmat
import tensorIO
import predictionTools

X, axisDict, classDict = tensorIO.loadSingleTensor("data/cms-tensor-{0}.dat")
Y = np.array(classDict.values(), dtype='int')
# C=990000 makes the L1 penalty essentially inactive (near-unregularized fit).
predModel = LogisticRegression(C=990000, penalty='l1', tol=1e-6)
flatX = sptenmat.sptenmat(X, [0]).tocsrmat()  # matricize along the first mode
testSize = 0.5

# Context manager guarantees the results file is flushed and closed even if a
# split raises mid-experiment.
with open("results/baseline-results.json", 'w') as outfile:
    # One split per seed; seeds spaced out so the runs are reproducible.
    for seed in range(0, 1000, 100):
        ttss = StratifiedShuffleSplit(Y, n_iter=1, test_size=testSize,
                                      random_state=seed)
        for train, test in ttss:
            baseAUC, basePred = predictionTools.getAUC(predModel, flatX, Y,
                                                       train, test)
            # One JSON record per line (JSON-lines format).
            output = {"type": "baseline", "seed": seed, "auc": baseAUC}
            outfile.write(json.dumps(output) + '\n')
## store off the raw file MFact[0].writeRawFile("results/pred-raw-marble-{0}.dat".format(exptID)) MFact[1].writeRawFile( "results/pred-raw-bias-marble-{0}.dat".format(exptID)) ## compare to the traditional non-negative startTime = time.time() MCPR, cpstats, mstats = CP_APR.cp_apr(trainX, R, maxiters=outerIter, maxinner=innerIter) cpaprElapse = time.time() - startTime MCPR.writeRawFile("results/pred-raw-cpapr-{0}.dat".format(exptID)) MCPR.normalize_sort(1) klp = KLProjection.KLProjection(MCPR.U, MCPR.R) cprFeat = klp.projectSlice(X, 0) ## prediction part baseAUC, basePred = predictionTools.getAUC(predModel, rawFeatures, Y, train, test) marbleAUC, marblePred = predictionTools.getAUC(predModel, pftMat, Y, train, test) cprAUC, cprPred = predictionTools.getAUC(predModel, cprFeat, Y, train, test) output['time'] = [0, cpaprElapse, marbleElapse] output['auc'] = [baseAUC, cprAUC, marbleAUC] output['order'] = ['Baseline', 'CP-APR', 'Marble'] with open("results/pred-{0}.json".format(exptID), 'w') as outfile: json.dump(output, outfile)
""" Experiment to compute the baseline predictive model using flat features """ import json import numpy as np from sklearn.cross_validation import StratifiedShuffleSplit from sklearn.linear_model import LogisticRegression import sys sys.path.append("..") import sptenmat import tensorIO import predictionTools X, axisDict, classDict = tensorIO.loadSingleTensor("data/cms-tensor-{0}.dat") Y = np.array(classDict.values(), dtype='int') predModel = LogisticRegression(C=990000, penalty='l1', tol=1e-6) flatX = sptenmat.sptenmat(X, [0]).tocsrmat() # matricize along the first mode testSize = 0.5 outfile = open("results/baseline-results.json", 'w') for seed in range(0, 1000, 100): ttss = StratifiedShuffleSplit(Y, n_iter=1, test_size=testSize, random_state=seed) for train, test in ttss: trainY = Y[train] baseAUC, basePred = predictionTools.getAUC(predModel, flatX, Y, train, test) output = {"type": "baseline", "seed": seed, "auc": baseAUC } outfile.write(json.dumps(output) + '\n') outfile.close()
## create the raw features rawFeatures = predictionTools.createRawFeatures(X) startTime = time.time() MFact, Minfo = SP_NTF.sp_ntf(trainX, R=R, alpha=alpha, gamma=gamma, maxiters = outerIter, maxinner=innerIter) marbleElapse = time.time() - startTime pftMat, pftBias = SP_NTF.projectTensor(X, MFact, 0, maxinner=innerIter) ## store off the raw file MFact[0].writeRawFile("results/pred-raw-marble-{0}.dat".format(exptID)) MFact[1].writeRawFile("results/pred-raw-bias-marble-{0}.dat".format(exptID)) ## compare to the traditional non-negative startTime = time.time() MCPR, cpstats, mstats = CP_APR.cp_apr(trainX, R, maxiters=outerIter, maxinner=innerIter) cpaprElapse = time.time() - startTime MCPR.writeRawFile("results/pred-raw-cpapr-{0}.dat".format(exptID)) MCPR.normalize_sort(1) klp = KLProjection.KLProjection(MCPR.U, MCPR.R) cprFeat = klp.projectSlice(X, 0) ## prediction part baseAUC, basePred = predictionTools.getAUC(predModel, rawFeatures, Y, train, test) marbleAUC, marblePred = predictionTools.getAUC(predModel, pftMat, Y, train, test) cprAUC, cprPred = predictionTools.getAUC(predModel, cprFeat, Y, train, test) output['time'] = [0, cpaprElapse, marbleElapse] output['auc'] = [baseAUC, cprAUC, marbleAUC] output['order'] = ['Baseline', 'CP-APR', 'Marble'] with open("results/pred-{0}.json".format(exptID), 'w') as outfile: json.dump(output, outfile)