""" Experiment to compute the baseline predictive model using flat features """ import json import numpy as np from sklearn.cross_validation import StratifiedShuffleSplit from sklearn.linear_model import LogisticRegression import sys sys.path.append("..") import sptenmat import tensorIO import predictionTools X, axisDict, classDict = tensorIO.loadSingleTensor("data/cms-tensor-{0}.dat") Y = np.array(classDict.values(), dtype='int') predModel = LogisticRegression(C=990000, penalty='l1', tol=1e-6) flatX = sptenmat.sptenmat(X, [0]).tocsrmat() # matricize along the first mode testSize = 0.5 outfile = open("results/baseline-results.json", 'w') for seed in range(0, 1000, 100): ttss = StratifiedShuffleSplit(Y, n_iter=1, test_size=testSize, random_state=seed) for train, test in ttss: trainY = Y[train] baseAUC, basePred = predictionTools.getAUC(predModel, flatX, Y, train, test) output = {"type": "baseline", "seed": seed, "auc": baseAUC}
# NOTE(review): fragment of a larger script — nparr_data_by_pt, nnz, num_dims,
# axisDict, od_patClass_first_10_ruid, exptID, MSize, AFill, R, alpha, gamma,
# and the imports (np, sptensor, tensorIO, tensorTools, time, SP_NTF,
# predictionTools) are all defined outside this chunk.

# Collect the nonzero entries as an (nnz_count, 1) column vector of values.
data_values = nparr_data_by_pt[nnz].flatten()
data_values = np.reshape(data_values, (len(data_values), 1))
# Build the (nnz_count, num_dims) subscript matrix for the sparse tensor.
# NOTE(review): assigning to .dtype REINTERPRETS the float64 zero buffer as
# int (same item size) rather than casting — presumably safe only because the
# array is still all zeros here and every column is overwritten below; confirm.
nonzero_subs = np.zeros((len(data_values), num_dims))
nonzero_subs.dtype = 'int'
for n in range(num_dims):
    nonzero_subs[:, n] = nnz[n]
sparse_tensor_first_10_ruid = sptensor.sptensor(nonzero_subs, data_values)
#save the tensor
tensorIO.saveSingleTensor(sparse_tensor_first_10_ruid, axisDict, od_patClass_first_10_ruid, "htn-first10-tensor-{0}.dat")

# ### LEFT OFF HERE: june 25, 6pm ##################################################################
## load the tensor #######
loaded_X, loaded_axisDict, loaded_classDict = tensorIO.loadSingleTensor("htn-first10-tensor-{0}.dat")

## do the decomposition ######
#store the data in "data"
data = {'exptID': exptID, 'size': MSize, 'sparsity': AFill, "rank": R, "alpha": alpha, "gamma": gamma}

def calculateValues(TM, M):
    # Compare decomposition M against reference TM; returns greedy factor
    # match score, factor overlap score, and M's nonzero count.
    fms = TM.greedy_fms(M)
    fos = TM.greedy_fos(M)
    nnz = tensorTools.countTensorNNZ(M)  # NOTE: shadows the outer `nnz` locally
    return fms, fos, nnz

##raw features
#rawFeatures = predictionTools.createRawFeatures(X)
startTime = time.time()#start time -- to time it
##factorization
spntf_htn_first_10_ruid = SP_NTF.SP_NTF(loaded_X, R=R, alpha=alpha)
# NOTE(review): fragment — the argparse parser and the opening of the
# "-t"/"--testSize" add_argument call begin above this chunk, and the
# trailing `output` dict literal is cut off before its closing brace.
                    default=0.5)
parser.add_argument("-g", '--gamma', nargs='+', type=float, help="gamma")
parser.add_argument("-s", "--seed", type=int, help="random seed", default=0)
args = parser.parse_args()

# Unpack command-line arguments into experiment parameters.
inputFile = args.infile
exptID = args.eid
testSize = args.testSize
innerIter = 10  # fixed inner iteration count (not exposed on the CLI)
outerIter = args.iter
R = args.rank
gamma = args.gamma
alpha = args.alpha
seed = args.seed

# Load tensor + labels; single stratified train/test split at `testSize`.
X, axisDict, classDict = tensorIO.loadSingleTensor(inputFile)
# Python 2 .values(); presumably ordered consistently with the tensor's
# first mode — TODO confirm against tensorIO.
Y = np.array(classDict.values(), dtype='int')
ttss = StratifiedShuffleSplit(Y, n_iter=1, test_size=testSize, random_state=seed)
predModel = LogisticRegression(C=1.0, penalty='l1', tol=1e-6)

# Result record for this experiment run (dict continues past this chunk).
output = { "expt": exptID, "iters": outerIter, "inner": innerIter, "R": R, "gamma": gamma, "alpha": alpha, "seed": seed
""" Experiment to compute the baseline predictive model using flat features """ import json import numpy as np from sklearn.cross_validation import StratifiedShuffleSplit from sklearn.linear_model import LogisticRegression import sys sys.path.append("..") import sptenmat import tensorIO import predictionTools X, axisDict, classDict = tensorIO.loadSingleTensor("data/cms-tensor-{0}.dat") Y = np.array(classDict.values(), dtype='int') predModel = LogisticRegression(C=990000, penalty='l1', tol=1e-6) flatX = sptenmat.sptenmat(X, [0]).tocsrmat() # matricize along the first mode testSize = 0.5 outfile = open("results/baseline-results.json", 'w') for seed in range(0, 1000, 100): ttss = StratifiedShuffleSplit(Y, n_iter=1, test_size=testSize, random_state=seed) for train, test in ttss: trainY = Y[train] baseAUC, basePred = predictionTools.getAUC(predModel, flatX, Y, train, test) output = {"type": "baseline", "seed": seed, "auc": baseAUC } outfile.write(json.dumps(output) + '\n') outfile.close()
parser.add_argument("-t", "--testSize", type=float, help="test size", default=0.5) parser.add_argument("-g", '--gamma', nargs='+', type=float, help="gamma") parser.add_argument("-s", "--seed", type=int, help="random seed", default=0) args = parser.parse_args() inputFile = args.infile exptID = args.eid testSize = args.testSize innerIter = 10 outerIter = args.iter R = args.rank gamma = args.gamma alpha = args.alpha seed = args.seed X, axisDict, classDict = tensorIO.loadSingleTensor(inputFile) Y = np.array(classDict.values(), dtype='int') ttss = StratifiedShuffleSplit(Y, n_iter=1, test_size=testSize, random_state=seed) predModel = LogisticRegression(C=1.0, penalty='l1', tol=1e-6) output = { "expt": exptID, "iters": outerIter, "inner": innerIter, "R": R, "gamma": gamma, "alpha": alpha, "seed": seed } for train, test in ttss: trainShape = list(X.shape) trainShape[0] = len(train) ## take the subset for training trainX = predictionTools.tensorSubset(X, train, trainShape) trainY = Y[train] ## create the raw features
# NOTE(review): fragment of a larger script — nparr_data_by_pt, nnz, num_dims,
# axisDict, od_patClass_all_finite, exptID, MSize, AFill, R, alpha, gamma,
# and the imports (np, sptensor, tensorIO, tensorTools, time, SP_NTF,
# predictionTools) are all defined outside this chunk.

# Collect the nonzero entries as an (nnz_count, 1) column vector of values.
data_values = nparr_data_by_pt[nnz].flatten()
data_values = np.reshape(data_values, (len(data_values), 1))
# Build the (nnz_count, num_dims) subscript matrix for the sparse tensor.
# NOTE(review): assigning to .dtype REINTERPRETS the float64 zero buffer as
# int (same item size) rather than casting — presumably safe only because the
# array is still all zeros here and every column is overwritten below; confirm.
nonzero_subs = np.zeros((len(data_values), num_dims))
nonzero_subs.dtype = 'int'
for n in range(num_dims):
    nonzero_subs[:, n] = nnz[n]
sparse_tensor_all_finite = sptensor.sptensor(nonzero_subs, data_values)
#save the tensor
tensorIO.saveSingleTensor(sparse_tensor_all_finite, axisDict, od_patClass_all_finite, "htn-allfinite-tensor-{0}.dat")

# ### LEFT OFF HERE: june 25, 6pm ##################################################################
## load the tensor #######
loaded_X, loaded_axisDict, loaded_classDict = tensorIO.loadSingleTensor("htn-allfinite-tensor-{0}.dat")

## do the decomposition ######
#store the data in "data"
data = {'exptID': exptID, 'size': MSize, 'sparsity': AFill, "rank": R, "alpha": alpha, "gamma": gamma}

def calculateValues(TM, M):
    # Compare decomposition M against reference TM; returns greedy factor
    # match score, factor overlap score, and M's nonzero count.
    fms = TM.greedy_fms(M)
    fos = TM.greedy_fos(M)
    nnz = tensorTools.countTensorNNZ(M)  # NOTE: shadows the outer `nnz` locally
    return fms, fos, nnz

##raw features
#rawFeatures = predictionTools.createRawFeatures(X)
startTime = time.time()#start time -- to time it
##factorization
spntf_htn_all_finite = SP_NTF.SP_NTF(loaded_X, R=R, alpha=alpha)