def LoadDataSet(self, dataSetType):
    s = Settings()
    if dataSetType == "A":
        mat = scipy.io.loadmat(s.getBasePath() + s.getInterimPath() + s.getDatasetAFileName())
        return mat['G0'][:, 0:s.sampleSize()]
    else:
        logWarning("HARD CODED VALUE from DataSetLoaderLib.LoadDataSet()")
        return [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0],
                [2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
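# A quick way to check the .mat contents before slicing: scipy.io.loadmat
# returns a dict, and 'G0' above is assumed to be the expression matrix.
# inspect_dataset_a is a hypothetical helper; its path argument stands in for
# the basePath + interimPath + fileName string built above:
import scipy.io

def inspect_dataset_a(path):
    mat = scipy.io.loadmat(path)
    keys = [k for k in mat.keys() if not k.startswith('__')]  # skip MATLAB metadata entries
    print("variables: " + str(keys))
    print("G0 shape: " + str(mat['G0'].shape))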
def LoadDataSetClasses(self, dataSetType):
    s = Settings()
    if dataSetType == "A_train":
        return numpy.array(joblib.load('datasetA_raw/DatasetA_TrainClasses.joblib.pkl'))
    elif dataSetType == "A_test":
        return numpy.array(joblib.load('datasetA_raw/DatasetA_ValidationClasses.joblib.pkl'))
    elif dataSetType == "B_train":
        return numpy.array(joblib.load('datasetB_raw/DatasetB_TrainClasses.joblib.pkl'))
    elif dataSetType == "B_test":
        return numpy.array(joblib.load('datasetB_raw/DatasetB_ValidationClasses.joblib.pkl'))
    elif dataSetType == "C_train":
        return numpy.array(joblib.load('datasetC_raw/DatasetC_TrainClasses.joblib.pkl'))
    elif dataSetType == "C_test":
        return numpy.array(joblib.load('datasetC_raw/DatasetC_ValidationClasses.joblib.pkl'))
    else:
        print "INVALID INPUT"
        logWarning("HARD CODED VALUE from DataSetLoaderLib.LoadDataSetClasses()")
        return [0, 1, 1, 1, 0, 1]
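# The six branches above differ only in the joblib path, so a lookup table can
# replace the elif chain. A minimal sketch, assuming plain `import joblib` (the
# snippets above call joblib.load directly) and the same file layout;
# load_classes is a hypothetical helper, not part of the repo:
import numpy
import joblib

CLASS_FILES = {
    "A_train": 'datasetA_raw/DatasetA_TrainClasses.joblib.pkl',
    "A_test": 'datasetA_raw/DatasetA_ValidationClasses.joblib.pkl',
    "B_train": 'datasetB_raw/DatasetB_TrainClasses.joblib.pkl',
    "B_test": 'datasetB_raw/DatasetB_ValidationClasses.joblib.pkl',
    "C_train": 'datasetC_raw/DatasetC_TrainClasses.joblib.pkl',
    "C_test": 'datasetC_raw/DatasetC_ValidationClasses.joblib.pkl',
}

def load_classes(dataSetType):
    if dataSetType not in CLASS_FILES:
        # same hard-coded fallback as LoadDataSetClasses above
        return [0, 1, 1, 1, 0, 1]
    return numpy.array(joblib.load(CLASS_FILES[dataSetType]))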
def CalculateSimilarity(self, theMatrix, aPart, cacheTopXPerPart):
    theMatrixTranspose = theMatrix.transpose()  # convert from 88x1M to 1Mx88 matrix
    # we will be saving the corr matrix by parts in different files and use the files for processing
    totalParts = theMatrix.shape[1] / aPart
    totalParts = totalParts if theMatrix.shape[1] % aPart == 0 else totalParts + 1
    logInfo('going to run for totalParts = ' + str(totalParts) + " i.e. " + str(theMatrix.shape[1]) + " / " + str(aPart))
    logDebug("theMatrix.size = " + str(theMatrix.shape))
    globalHash = {}
    settings = Settings()
    # worked till 32000, but to make it completely divisible I did 1004004/36=27889; there is a
    # hard limit of 2^27 elements processed at a time:
    # http://stackoverflow.com/questions/13187443/nvidia-cufft-limit-on-sizes-and-batches-for-fft-with-scikits-cuda
    # will need to calculate it in a loop of 32 OR launch 32 such warps
    vectorCache = []  # numpy.array([], dtype=[('i', int), ('j', int), ('corr', float)]); (-1, -1, 0.0)
    # NOTE: range(0, totalParts - 1) never visits the final part (index totalParts - 1)
    for i in range(0, totalParts - 1):
        print("corr calculator i=" + str(i))
        for j in range(i, totalParts - 1):
            logDebug("corr calculator j=" + str(j) + " out of " + str(totalParts - 1))
            # the last part should not take full space if it is not required to do so
            A = theMatrixTranspose[i * aPart:(i + 1) * aPart, :].tolist()
            B = theMatrixTranspose[j * aPart:(j + 1) * aPart, :].tolist()
            #logDebug("A = " + str(len(A)))
            #logDebug("B = " + str(len(B)))
            if settings.isLocalMachine():
                Result = A[:][0:1000]  # dummy result on the local machine (no GPU)
            else:
                Result = pearson_correlation(A, B)
            logInfo('going to concatenate vectorCache with the main list')
            newlist, localHash = self.ExtractTopCorrValues(Result, cacheTopXPerPart, aPart, globalHash, i, j)
            # http://stackoverflow.com/questions/38987/how-can-i-merge-two-python-dictionaries-in-a-single-expression
            globalHash.update(localHash)
            vectorCache.extend(newlist)
            logInfo('concatenation done...')
    # return the list sorted by correlation, highest first
    logInfo('going to sort... ')
    #logDebug(vectorCache)
    vectorCache.sort(key=lambda x: x[2], reverse=True)  # each entry has 4 fields (Fa, Fb, Corr, '')
    logInfo('Calculate Similarity done. ' + str(len(vectorCache)))
    return vectorCache
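# pearson_correlation above comes from elsewhere in the repo (the cuFFT comment
# suggests a GPU path). A CPU stand-in for the same row-vs-row block, using
# numpy only, might look like this; pearson_block is a hypothetical helper and
# not necessarily what the repo's pearson_correlation does:
import numpy

def pearson_block(A, B):
    # A: m x n, B: k x n, rows are feature vectors (like the
    # theMatrixTranspose slices above); returns the m x k matrix of
    # Pearson correlations between rows of A and rows of B
    A = numpy.asarray(A, dtype=float)
    B = numpy.asarray(B, dtype=float)
    A = A - A.mean(axis=1, keepdims=True)   # centre each row
    B = B - B.mean(axis=1, keepdims=True)
    A = A / numpy.sqrt((A * A).sum(axis=1, keepdims=True))  # normalise each row
    B = B / numpy.sqrt((B * B).sum(axis=1, keepdims=True))
    return numpy.dot(A, B.T)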
class DatasetLoader(object):
    s = Settings()
    __fileBasePath = s.getBasePath()

    def LoadDataSet(self, dataSetType):
        if dataSetType == "A":
            # class attributes must be reached through self here; the original
            # referenced __fileBasePath and s as bare names, which raises NameError
            mat = scipy.io.loadmat(self.__fileBasePath + self.s.getInterimPath() + self.s.getDatasetAFileName())
            return mat['G0']
        else:
            return [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0],
                    [2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
def LoadEnhancedDataSet(self, dataSetType):
    s = Settings()
    if dataSetType == "A":
        with open('objs.pickle.backup') as f:  # the file will need to be f.pickle
            return pickle.load(f)[1]  # this will need to be changed to [0]
    else:
        logWarning("HARD CODED VALUE from DataSetLoaderLib.LoadEnhancedDataSet()")
        return [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0],
                [2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
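# For context, a writer that would produce the structure the reader above
# expects: a pickled list whose element [1] is the enhanced dataset (the inline
# comments say this should eventually become element [0] of f.pickle).
# save_enhanced is a hypothetical helper, not part of the repo:
import pickle

def save_enhanced(original, enhanced, path='objs.pickle.backup'):
    with open(path, 'wb') as f:
        pickle.dump([original, enhanced], f)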
def LoadDataSet(self, dataSetType):
    s = Settings()
    if dataSetType == "A":
        return numpy.array(joblib.load('DatasetA_Validation.joblib.pkl'))
        #mat = scipy.io.loadmat(s.getBasePath() + s.getInterimPath() + s.getDatasetAFileName())
        #return mat['G0'][:, 0:s.sampleSize()]
    elif dataSetType == "B_train":
        return numpy.array(joblib.load('DataSetBGSE24417MAQCIITraining_data.joblib.pkl'))
    elif dataSetType == "B_test":
        return numpy.array(joblib.load('DataSetBGSE24417MAQCIIValidation_data.joblib.pkl'))
    else:
        print "INVALID INPUT"
        logWarning("HARD CODED VALUE from DataSetLoaderLib.LoadDataSet()")
        return [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0],
                [2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
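# Example usage of the loaders above, as a sketch: it assumes these methods sit
# on the DatasetLoader class defined earlier and that the joblib files are in
# the working directory:
loader = DatasetLoader()
X_train = loader.LoadDataSet("B_train")
y_train = loader.LoadDataSetClasses("B_train")
X_test = loader.LoadDataSet("B_test")
y_test = loader.LoadDataSetClasses("B_test")
print("train: " + str(X_train.shape) + " test: " + str(X_test.shape))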
from GlobalUtils import *
import numpy
from LoadDataSetA import LoadDataSet
from MachineSpecificSettings import Settings

s = Settings()
p = 20
w = 2
# fixed: numpy.matrix((88, p)) builds a 1x2 matrix out of the tuple; zeros gives the intended 88xp array
G = numpy.zeros((88, p), dtype=float)
Result = numpy.zeros([p, p], dtype=float)
logDebug("going to start the loops now")
# the original discarded this return value; presumably it is meant to populate G
G = numpy.asarray(LoadDataSet())
for i in range(0, p / w):
    print("i=" + str(i))
    for j in range(i, p / w):
        print("j=" + str(j))
        A = G[:, i * w:(i + 1) * w]
        B = G[:, j * w:(j + 1) * w]
        # corrcoef with rowvar=0 treats columns as variables; the upper-right
        # w x w block holds the correlations of A's columns against B's columns.
        # (The original computed R1 only when i == j, so its else branch reused
        # a stale R1 from an earlier iteration.)
        C = numpy.corrcoef(A, B, rowvar=0)
        R1 = C[0:w, w:2 * w]
        Result[i * w:(i + 1) * w, j * w:(j + 1) * w] = R1
        if i != j:
            # mirror into the symmetric lower block
            Result[j * w:(j + 1) * w, i * w:(i + 1) * w] = R1.transpose()
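# Sanity-check sketch for the block-wise loop above: when p is divisible by w,
# Result should match the direct column-wise correlation matrix of G.
# check_blockwise is a hypothetical helper, not part of the repo:
def check_blockwise(G, Result, atol=1e-8):
    expected = numpy.corrcoef(G, rowvar=0)  # p x p, columns as variables
    return numpy.allclose(Result, expected, atol=atol)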