def loadDataset(identifier):
    d = DataSetLoader()
    x = d.LoadDataSet(identifier)
    print 'X', x.shape
    y = d.LoadDataSetClasses(identifier)
    print 'Y', y.shape
    y = numpy.transpose(y.astype(numpy.int64))
    print 'Y', y.shape
    target = []
    y = list(y)
    print "y before manual transform =", y
    for i in y:
        target.append(int(i))
    y = target  #use the manually int-converted labels
    print len(y)
    print y
    return x, y
def loadDataset(identifier):
    d = DataSetLoader()
    x = d.LoadDataSet(identifier)
    print 'X', x.shape
    y = d.LoadDataSetClasses(identifier)
    print 'Y', y.shape
    #y = numpy.transpose(y.astype(numpy.int64))
    y = sklearn.utils.validation.column_or_1d(y, warn=True)
    print 'Y', y.shape
    target = []
    y = list(y)
    print "y before manual transform =", y
    for i in y:
        target.append(int(i))
    y = target  #use the manually int-converted labels
    print len(y)
    print y
    return x, y
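
# Minimal usage sketch (added for illustration, not part of the original script):
# the "B_train" identifier is an assumption borrowed from the other scripts in
# this collection; any identifier accepted by DataSetLoader would work.
if __name__ == '__main__':
    X, y = loadDataset("B_train")
    print 'loaded', X.shape, len(y)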
    # check selected features
    print (feat_selector.support_)
    # check ranking of features
    print (feat_selector.ranking_)
    print (len(feat_selector.ranking_))
    selected_indices = feat_selector.ranking_
    # call transform() on X to filter it down to selected features
    X_filtered = feat_selector.transform(X)
    return [X_filtered, selected_indices]

d = DataSetLoader()
x = d.LoadDataSet("B_train")
y = d.LoadDataSetClasses("B_train")
print y.shape
y = numpy.transpose(y)
print x.shape
print y.shape
target = []
y = list(y)
for i in y:
    target.append(int(i))
print len(y)
sizes = ['10', '50', '100', '150', '200', '250']
methods = ['MRMR', 'JMI', 'JMIM']
for method in methods:
    for size in sizes:
from MachineSpecificSettings import Settings
import scipy.io
import numpy
from DataSetLoaderLib import DataSetLoader
import csv
#Used for storing and loading the trained classifier
from sklearn.externals import joblib

print("")
print("")
print("")
print("")
#targets = numpy.array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1])
targets = numpy.array(joblib.load('DatasetA_ValidationClasses.joblib.pkl'))
variables = None
d = DataSetLoader()
variables = d.LoadDataSet("A")
#variables = G[:,0:100];
indices = joblib.load('selected_indices_MRMR.joblib.pkl')
variables = numpy.array(variables)[:, indices]
#print variables.shape
#print len(variables)
"""
convert an array to csv
http://stackoverflow.com/questions/16482895/convert-a-numpy-array-to-a-csv-string-and-a-csv-string-back-to-a-numpy-array
targetsString = ','.join(['%d' % num for num in targets[0]])
variablesString = ','.join(['%.5f' % num for num in variables[0]])
numpy.fromstring(targetsString, sep=',')
import numpy as np
import random
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from MachineSpecificSettings import Settings
from DataSetLoaderLib import DataSetLoader
from sklearn.externals import joblib
from evolutionary_search import EvolutionaryAlgorithmSearchCV

y = np.array(joblib.load('DatasetA_ValidationClasses.joblib.pkl'))
d = DataSetLoader()
X_original = d.LoadDataSet("A")
paramgrid = {"kernel": ["rbf"],
             "C": np.logspace(-9, 9, num=25, base=10),
             "gamma": np.logspace(-9, 9, num=25, base=10)}
sizes = ['10', '50', '100', '150', '200', '250']
methods = ['MRMR', 'JMI', 'JMIM']
targets = np.array(joblib.load('DatasetA_ValidationClasses.joblib.pkl'))
for method in methods:
    for size in sizes:
        random.seed(1)
        X = X_original
        indices = joblib.load(method + ' PICKLES/selected_indices_' + method + '.joblib.pkl')
        X = np.array(X)[:, indices]
        indices = joblib.load(method + ' PICKLES/' + size + '-' + method + '.joblib.pkl')
        X = np.array(X)[:, indices]
        f = open('genetic/' + method + '-' + size + '.txt', 'w')
        print size
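        # Hedged sketch (added for illustration, not in the original fragment): one way
        # the paramgrid above could be consumed with sklearn-deap's
        # EvolutionaryAlgorithmSearchCV; the scoring, fold count, and population
        # settings below are assumptions, not values taken from the original script.
        #cv = EvolutionaryAlgorithmSearchCV(estimator=SVC(),
        #                                   params=paramgrid,
        #                                   scoring="accuracy",
        #                                   cv=StratifiedKFold(targets, n_folds=4),
        #                                   verbose=True,
        #                                   population_size=50,
        #                                   gene_mutation_prob=0.10,
        #                                   generations_number=10)
        #cv.fit(X, targets)
        #f.write(str(cv.best_params_) + ' ' + str(cv.best_score_))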
import scipy.io
import numpy
from DataSetLoaderLib import DataSetLoader
import csv
#Used for storing and loading the trained classifier
from sklearn.externals import joblib

print("")
print("")
print("")
print("")
variables = None
targets = None
d = DataSetLoader()
variables = d.LoadDataSet("A")
targets = d.LoadDataSetClasses("A")
"""
convert an array to csv
http://stackoverflow.com/questions/16482895/convert-a-numpy-array-to-a-csv-string-and-a-csv-string-back-to-a-numpy-array
targetsString = ','.join(['%d' % num for num in targets[0]])
variablesString = ','.join(['%.5f' % num for num in variables[0]])
numpy.fromstring(targetsString, sep=',')

load a csv to an array
http://stackoverflow.com/questions/13381815/python-csv-text-file-to-arrayi-j
"""
selected_indices = []
[subset, selected_indices] = SelectSubSetmRMR(variables, targets)
from DataSetLoaderLib import DataSetLoader
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import VotingClassifier
from sklearn.externals import joblib  #needed for joblib.load below
import numpy

sizes = ['10', '50', '100', '150', '200', '250']
methods = ['MRMR', 'JMI', 'JMIM']
for method in methods:
    for size in sizes:
        print size
        print method
        import time
        d = DataSetLoader()
        X_train = d.LoadDataSet("B_train")
        y_train = d.LoadDataSetClasses("B_train")
        print X_train.shape
        print y_train.shape
        #quick hack ("chaipee"); will fix it properly later on
        y_train = numpy.transpose(y_train)
        print y_train.shape
        targets = list(y_train)
        y_train = []
        for i in targets:
            #print i
            y_train.append(int(i))
        #print len(y_train)
        indices = joblib.load('datasetB_pickles/datasetB' + size + '-' + method + '.joblib.pkl')
            if i + self.add_by >= 545089:
                i += 1
            values.append(i + self.add_by)
            joblib.dump(values,
                        'selected_features/datasetC/selected_indices' + '_' + useMethod + '.joblib.pkl',
                        compress=9)
        except:
            type, value, traceback = sys.exc_info()
            print('Error Occurred %s: %s: %s' % (type, value, traceback))
        threadLock.release()
        print len(values)
        print "Exiting " + self.name
        return

threads = []
d = DataSetLoader()
G = d.LoadDataSet("C_train")
targets = d.LoadDataSetClasses("C_train")
#quick hack ("chaipee"); will fix it properly later on
#y_train = numpy.transpose(targets)
y_train = numpy.asarray(targets)
print y_train.shape
targets = list(y_train)
y_train = []
for i in targets:
    print i
    y_train.append(int(i))
targets = y_train
#targets = numpy.asarray(targets)
#targets = column_or_1d(targets, warn=True)
#datasets = ["A","B"] classifiers = ["AdaBoost", "DT", "MLP", "SVM", "RandomForest", "ExtraTree"] dataset = "B" f = open('mcc/B-Full-mccResults' + dataset + '.txt', 'w') f.write( "dataset, size, method, classifier, validationTechnique, mc, timeTaken, extra info" ) for classifierName in classifiers: for method in methods: for size in sizes: for preproc in preprocessings: for validation in validationTechnique: #print size #print method d = DataSetLoader() X_train = d.LoadDataSet("B_train") y_train = d.LoadDataSetClasses("B_train") X_test = d.LoadDataSet("B_test") y_test = d.LoadDataSetClasses("B_test") #chaipee will fix it later on y_train = numpy.transpose(y_train) print y_train.shape targets = list(y_train) y_train = [] for i in targets: #print i y_train.append(int(i)) y_test = numpy.transpose(y_test)
            i += 1
            values.append(i + self.add_by)
            joblib.dump(values,
                        'selected_indices' + '_' + useMethod + '.joblib.pkl',
                        compress=9)
        except:
            print "Error Occurred"
        threadLock.release()
        print len(values)
        print "Exiting " + self.name
        return

threads = []
d = DataSetLoader()
G = d.LoadDataSet("B_train")
targets = d.LoadDataSetClasses("B_train")
print "Dataset loaded"
G = numpy.asarray(G)
targets = numpy.asarray(targets)
threadLock = threading.Lock()
print G.shape
vals = 649
original = 649
for i in range(0, 1547):
    print "vals= " + str(vals) + "\n"
    # Create new threads
    thread = myThread(i, "Thread-" + str(i), vals - original,
#add headers
sizes = ['10', '50', '100', '150', '200', '250']
methods = ['MRMR', 'JMI', 'JMIM']
validationTechnique = ['LOOCV', "10FoldCV"]
preprocessing = ['Standard', 'Imputer', 'Robust', 'Quantile']
#datasets = ["A","B"]
classifiers = ["MLP", "SVM", "AdaBoost", "DT", "RandomForest", "ExtraTree"]
dataset = "A"
f = open('mcc/mccResults' + dataset + '.txt', 'w')
f.write("dataset, size, method, classifier, validationTechnique, mc, timeTaken, extra info")
for classifierName in classifiers:
    for method in methods:
        for size in sizes:
            for preproc in preprocessing:
                d = DataSetLoader()
                X_train = d.LoadDataSet(dataset)
                y_train = d.LoadDataSetClasses(dataset)
                #print X_train.shape
                #print y_train.shape
                #quick hack ("chaipee"); will fix it properly later on
                y_train = numpy.transpose(y_train)
                #print y_train.shape
                targets = list(y_train)
                y_train = []
                for i in targets:
                    #print i
                    y_train.append(int(i))
                #print len(y_train)
                #first run indices
sizes = ['10', '50', '100', '150', '200', '250']
methods = ['MRMR', 'JMI', 'JMIM']
#validationTechnique = ['LOOCV', "10FoldCV"] -- NOT USED???
#preprocessing = ['', 'NP']
#datasets = ["A","B"]
classifiers = ["MLP", "SVM", "AdaBoost", "DT", "RandomForest", "ExtraTree"]
f = open('mcc/mccResultsC.txt', 'w')
f.write("dataset, size, method, classifier, validationTechnique, mc, timeTaken")
for classifierName in classifiers:
    for method in methods:
        for size in sizes:
            #print size
            #print method
            d = DataSetLoader()
            X_train = d.LoadDataSet("C_train")
            y_train = d.LoadDataSetClasses("C_train")
            X_test = d.LoadDataSet("C_test")
            y_test = d.LoadDataSetClasses("C_test")
            #quick hack ("chaipee"); will fix it properly later on
            y_train = numpy.transpose(y_train)
            print y_train.shape
            targets = list(y_train)
            y_train = []
            for i in targets:
                #print i
                y_train.append(int(i))
            y_test = numpy.transpose(y_test)
#Different feature selection methods
datasets = ['B']  #,'A'
methods = ['MRMR']  #,'JMI','JMIM'
sizes = ['10']  #,'50','100','150','200','250'
validationTechniques = ["10FoldCV"]  #"LOOCV",
preps = ["Standard", "Robust", "Quantile", "Imputer"]
#Iterating over each dataset
print("Dataset, prepType, validationTechnique, method, size")
for dataset in datasets:
    # f = open('mcc/mccEnsembleResults.txt', 'w')
    # f.write("dataset, size, method, classifier, validationTechnique, mc, timeTaken, cv.max, cv.mean, cv.min, cv.std, preprocessing")
    print "Dataset = ", dataset
    #initiating the DataSetLoader object
    d = DataSetLoader()
    #loading the relevant data and corresponding labels of the current dataset
    X_train = d.LoadDataSet(dataset + "_train")
    y_train = d.LoadDataSetClasses(dataset + "_train")
    X_test = d.LoadDataSet(dataset + "_test")
    y_test = d.LoadDataSetClasses(dataset + "_test")
    print("Dimensions of validation data and labels:", X_test.shape, y_test.shape)
    #quick hack ("chaipee"); will fix it properly later on
    targets = list(numpy.transpose(y_train))
    y_train = []
    for i in targets:
        y_train.append(int(i))
def main():
    """
    for each of the subsets s of indexes from 0-1004003 of length between 1 and 10
        #use biological info of known genes and mutual information and top down level of tree nodes
        [top down means ok interpretation and vague idea of root cause. bottom up means poor interpretation but pin pointed root cause identification]
        #check which subset is the best one by sorting them in desc order of error and then reliability
        create d as vertical projection of dataset using s indexes only
        for partition = 1 to length-2
            create trainingSet of size partition
            create testSet of size length-partition
            calculate Error Rate & Reliability using CV10
        calculate avg Error Rate and avg Reliability
        pick the best
    """
    print("\n\n\n\n\n")
    datasetLoader = DataSetLoader()
    setSize = 3
    CVSetting = 2
    classLabels = []
    enhancedGeneSet = []
    classLabels.extend(datasetLoader.GetClassLabels("A"))
    enhancedGeneSet.extend(datasetLoader.LoadEnhancedDataSet("A"))
    enhancedGeneSet = np.array(enhancedGeneSet)
    logInfo('Loaded the datasets')
    for s in range(1, 1 + setSize):
        for i in range(0, np.array(enhancedGeneSet).shape[1]):
            allCombinations = combinations(range(1, 1 + enhancedGeneSet.shape[1] - 1), s)
            #TODO: go from 1 to setSize and for the selected top X from amongst one level, make sure the next level subset contains them as prefix so we
            logInfo("allCombinations generated...")
            for aCombination in allCombinations:
                logDebug('aCombination')
                logDebug(aCombination)
                #on this combination, perform LOOCV (Leave one out cross validation)
                tempDataSet = enhancedGeneSet[:, aCombination[:]]
                logDebug('temp Data Set')
                logDebug(tempDataSet.shape)
                logInfo('going to partition the tempDataSet')
                logDebug(tempDataSet.shape[0])
                for partition in range(CVSetting, 1 + tempDataSet.shape[0]):
                    #Using CV1
                    logDebug("Partition")
                    logDebug(partition)
                    trainingLabels = classLabels[0:partition]
                    trainingSet = tempDataSet[0:partition, :]
                    logDebug('training set')
                    logDebug(trainingSet.shape)
                    testSet = tempDataSet[partition:tempDataSet.shape[0], :]
                    testLabels = classLabels[partition:tempDataSet.shape[0]]
                    logDebug('test set')
                    logDebug(testSet.shape)
                    print(trainingLabels)
                    print(trainingSet)
                    classifier = Train(trainingSet, trainingLabels)
                    errorRate, reliability, jScore = Evaluate(classifier, tempDataSet, testSet, testLabels, 1)
                    print errorRate, ";", reliability, ";", jScore
    return
from sklearn.externals import joblib
import numpy
from MachineSpecificSettings import Settings
import scipy.io
from DataSetLoaderLib import DataSetLoader
from sklearn.metrics import accuracy_score
import time

y = numpy.array(joblib.load('DatasetA_ValidationClasses.joblib.pkl'))
x = DataSetLoader()
x = x.LoadDataSet("A")
train_p = 0
train_n = 0
test_p = 0
test_n = 0
total = 0
x_test = []
y_test = []
x_train = []
y_train = []
for i in range(0, len(y)):
    if y[i] == 1:
        if train_p < 26:
            x_train.append(x[i])
            y_train.append(y[i])
            train_p += 1
        elif test_p < 28:
            x_test.append(x[i])
            y_test.append(y[i])
            test_p += 1
datasets = ['C', 'B', 'A']
methods = ['MRMR', 'JMI', 'JMIM']
sizes = ['10', '50', '100', '150', '200', '250']
classifiers = ["RandomForest", "AdaBoost", "DT", "ExtraTree", "MLP", "SVM"]
validationTechniques = ["10FoldCV"]  #"LOOCV",
preps = ["Standard", "Robust", "Quantile", "Imputer"]
basePath = ''  #needed when we want to run it locally
#Iterating over each dataset
for dataset in datasets:
    f = open('mcc/mccResults' + dataset + '.txt', 'a')
    f.write('\n{date:%Y-%m-%d_%H:%M:%S}'.format(date=datetime.datetime.now()))
    #f.write("dataset, size, method, classifier, validationTechnique, mc, timeTaken, cv.max, cv.mean, cv.min, cv.std, preprocessing")
    print "Dataset = ", dataset
    #initiating the DataSetLoader object
    d = DataSetLoader()
    #loading the relevant data and corresponding labels of the current dataset
    X_train_full = d.LoadDataSet(dataset + "_train")
    y_train = d.LoadDataSetClasses(dataset + "_train")
    X_validate_full = d.LoadDataSet(dataset + "_test")
    y_validate = d.LoadDataSetClasses(dataset + "_test")
    print("Dimensions of training data and labels:", X_train_full.shape, y_train.shape)
    print("Dimensions of validation data and labels:", X_validate_full.shape, y_validate.shape)
    #READY with the dataset, going to perform the main loop now
    for method in methods:
        #Iterating over each size
from DataSetLoaderLib import DataSetLoader
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.externals import joblib  #needed for joblib.load below
import numpy

sizes = ['10', '50', '100', '150', '200', '250']
methods = ['MRMR', 'JMI', 'JMIM']
for method in methods:
    for size in sizes:
        print size
        print method
        import time
        d = DataSetLoader()
        X_train = d.LoadDataSet("A")
        y_train = d.LoadDataSetClasses("A")
        print X_train.shape
        print y_train.shape
        #quick hack ("chaipee"); will fix it properly later on
        y_train = numpy.transpose(y_train)
        print y_train.shape
        targets = list(y_train)
        y_train = []
        for i in targets:
            #print i
            y_train.append(int(i))
        #print len(y_train)
        #first run indices
        indices = joblib.load('datasetA_pickles/selected_indices_' + method +
    Store the ensemble outputs to basePath + "\Infiltration_ensembles\Dataset.lig.csv"
'''
Datasets = ["C"]  #"B","A",
#Dataset = "B"  #for testing purposes
for Dataset in Datasets:
    if (eval("len(" + Dataset + "_LIG_Accuracies) != len(" + Dataset + "_LIGs)")):
        print Dataset + "_LIG mismatches the accuracies list"
    actuals = ""
    results = ""
    LIGs = eval(Dataset + "_LIGs")
    LIG_Accuracies = eval(Dataset + "_LIG_Accuracies")
    start_time = time.time()
    padding = 0
    #load the dataset
    d = DataSetLoader()
    #X_train_full = d.LoadDataSet(Dataset + "_train")
    #y_train = d.LoadDataSetClasses(Dataset + "_train")
    #targets = list(numpy.transpose(y_train))
    #y_train = []
    #for i in targets:
    #    y_train.append(int(i))
    X_validate_full = d.LoadDataSet(Dataset + "_test")
    y_validate = d.LoadDataSetClasses(Dataset + "_test")
    print("Dimensions of validation data and labels:", X_validate_full.shape, y_validate.shape)
    targets = list(numpy.transpose(y_validate))
    y_validate = []
    if Dataset == "C":
        y_validate = numpy.array(targets)
#Used for storing and loading the trained classifier
from sklearn.externals import joblib
import numpy
from MachineSpecificSettings import Settings
import scipy.io
from DataSetLoaderLib import DataSetLoader
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score

print("")
print("")
print("")
print("")
targets = numpy.array(joblib.load('DatasetA_ValidationClasses.joblib.pkl'))
d = DataSetLoader()
G = d.LoadDataSet("A")
indices = joblib.load('selected_indicesv2.joblib.pkl')
result = numpy.array(G)[:, indices]
clf = ExtraTreesClassifier()
import time
start_time = time.time()
scores = cross_val_score(clf, result, targets, cv=10)
end_time = time.time() - start_time
print end_time
for i in scores:
    print i
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
filename = 'ExtraTreesClassifier_k-fold.joblib.pkl'
joblib.dump(clf, filename, compress=9)
def performTreeletClustering(DatasetName):
    saveFreq = 1000
    #temp value for i
    x = -1
    if (not (os.path.isfile("objs.pickle"))):
        print "New Start"
        d = DataSetLoader()
        G = d.LoadDataSet(DatasetName)
        F = G
        M = []
        cacheTopXPerPart = d.CacheTopXPerPart(DatasetName)
        corrCalculator = PairwisePearsonCorrelationCalculator()
        print "calling corr calculator"
        corrMatrix = corrCalculator.CalculateSimilarity(G, d.GetPartSize(DatasetName), cacheTopXPerPart)
    else:
        print "continuing from where we left off"
        d = DataSetLoader()
        G, F, M, x, corrMatrix, cacheTopXPerPart = read()
        corrCalculator = PairwisePearsonCorrelationCalculator()
    p = F[0, :].size
    #because we have already done the previous iteration and loaded that one
    #save(G, F, M, i, corrMatrix, cacheTopXPerPart)
    i = x + 1
    print corrMatrix[0]
    while i < p:  #for i in range(x+1, p):
        recalc = False
        if checkCorr(corrMatrix, p) == 0:
            print "ERROR IN CORRMATRIX INDEX"
            return 0
        #calculating value of p
        p = F[0, :].size
        print "Value of i is : " + str(i) + " out of " + str(p)
        theVectors = corrMatrix[0]  #this is always the max corr so the element we want to process
        try:
            if (corrMatrix[0][3] == ''):
                recalc = False
            else:
                print corrMatrix[0]
                recalc = True
        except:
            pass
        Fa = F[:, theVectors[0]]
        Fb = F[:, theVectors[1]]
        print "calling generate metagene"
        m = generateNewMetaGene(Fa, Fb)
        print "calling scipy delete on F"
        F = scipy.delete(F, theVectors[1], 1)
        if not len(M):  #if this is the first meta gene in this matrix
            M = m
        else:
            M = numpy.column_stack((m, M))  #include in the meta genes set as well
        corrMatrix.pop(0)
        corrMatrix = corrCalculator.UpdateSimilarity(corrMatrix, F, list(m), theVectors[0], theVectors[1])
        F[:, theVectors[0]] = m
        if len(corrMatrix) <= 0 or recalc == True:
            #everything after this is potentially incorrect so lets recalculate the matrix
            corrMatrix = corrCalculator.CalculateSimilarity(F, d.GetPartSize(DatasetName), cacheTopXPerPart)
        if i % saveFreq == 0:
            save(G, F, M, i, corrMatrix, cacheTopXPerPart)
        i += 1
    F = numpy.column_stack((G, M))  #scipy.append(G, M, 1)  #define a new expanded featureset F = G U M
    return F
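
# Illustrative usage sketch (added, not part of the original module): the dataset
# identifier "A" is an assumption taken from the other scripts in this collection.
if __name__ == '__main__':
    expandedFeatureSet = performTreeletClustering("A")
    print "expanded feature set shape:", expandedFeatureSet.shape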