def LoadDataSet(self, dataSetType):
    s = Settings()
    if dataSetType == "A":
        mat = scipy.io.loadmat(s.getBasePath() + s.getInterimPath() +
                               s.getDatasetAFileName())
        return mat['G0'][:, 0:s.sampleSize()]
    else:
        logWarning("HARD CODED VALUE from DataSetLoaderLib.LoadDataSet()")
        return [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0],
                [2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
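scipy.io.loadmat returns a dict keyed by the MATLAB variable names, which is why the snippet indexes mat['G0'] before slicing off the first sampleSize() columns. A minimal, self-contained round-trip; the file name and array are illustrative, not from the project:

import numpy
import scipy.io

scipy.io.savemat('demo.mat', {'G0': numpy.arange(12.0).reshape(3, 4)})
mat = scipy.io.loadmat('demo.mat')  # dict: variable name -> ndarray
print(mat['G0'][:, 0:2])            # first two columns, as in LoadDataSet above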
Example #2
    def LoadDataSetClasses(self, dataSetType):
        s = Settings()
        if dataSetType == "A_train":
            variables = numpy.array(
                joblib.load('datasetA_raw/DatasetA_TrainClasses.joblib.pkl'))
            return variables
        elif dataSetType == "A_test":
            variables = numpy.array(
                joblib.load(
                    'datasetA_raw/DatasetA_ValidationClasses.joblib.pkl'))
            return variables
        elif dataSetType == "B_train":
            variables = numpy.array(
                joblib.load('datasetB_raw/DatasetB_TrainClasses.joblib.pkl'))
            return variables
        elif dataSetType == "B_test":
            variables = numpy.array(
                joblib.load(
                    'datasetB_raw/DatasetB_ValidationClasses.joblib.pkl'))
            return variables
        elif dataSetType == "C_train":
            variables = numpy.array(
                joblib.load('datasetC_raw/DatasetC_TrainClasses.joblib.pkl'))
            return variables
        elif dataSetType == "C_test":
            variables = numpy.array(
                joblib.load(
                    'datasetC_raw/DatasetC_ValidationClasses.joblib.pkl'))
            return variables

        else:
            print("INVALID INPUT")
            logWarning(
                "HARD CODED VALUE from DataSetLoaderLib.LoadDataSetClasses()")
            return [0, 1, 1, 1, 0, 1]
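The six branches differ only in the path handed to joblib.load, so a table-driven variant keeps the type-to-file mapping in one place. A minimal sketch: the _CLASS_FILES dict and the standalone function name are illustrative, while the paths and fallback labels are copied from the branches above.

import numpy
import joblib

_CLASS_FILES = {
    "A_train": 'datasetA_raw/DatasetA_TrainClasses.joblib.pkl',
    "A_test": 'datasetA_raw/DatasetA_ValidationClasses.joblib.pkl',
    "B_train": 'datasetB_raw/DatasetB_TrainClasses.joblib.pkl',
    "B_test": 'datasetB_raw/DatasetB_ValidationClasses.joblib.pkl',
    "C_train": 'datasetC_raw/DatasetC_TrainClasses.joblib.pkl',
    "C_test": 'datasetC_raw/DatasetC_ValidationClasses.joblib.pkl',
}

def load_dataset_classes(dataSetType):
    path = _CLASS_FILES.get(dataSetType)
    if path is None:
        # same fallback labels as the original
        print("INVALID INPUT")
        return [0, 1, 1, 1, 0, 1]
    return numpy.array(joblib.load(path))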
Example #3
    def CalculateSimilarity(self, theMatrix, aPart, cacheTopXPerPart):
        theMatrixTranspose = theMatrix.transpose()
        # convert from 88x1M to 1Mx88; the correlation matrix is built in
        # aPart-wide blocks so the parts can be saved to and processed from files
        totalParts = theMatrix.shape[1] // aPart  # floor division, Python 3 safe
        totalParts = totalParts if theMatrix.shape[1] % aPart == 0 else totalParts + 1
        logInfo('going to run for totalParts = ' + str(totalParts) + " i.e. " +
                str(theMatrix.shape[1]) + " / " + str(aPart))
        logDebug("theMatrix.size = " + str(theMatrix.shape))
        globalHash = {}
        settings = Settings()
        # worked up to 32000, but 1004004/36 = 27889 was used to keep the parts
        # evenly divisible; there is a hard limit of 2^27 elements per batch
        # (http://stackoverflow.com/questions/13187443/nvidia-cufft-limit-on-sizes-and-batches-for-fft-with-scikits-cuda)
        # so larger inputs would need a loop of 32 passes OR 32 such warps launched

        vectorCache = []  # entries are (Fa, Fb, Corr, '') tuples; sorted by Corr below
        for i in range(totalParts):  # range(totalParts - 1) would skip the last, partial part
            print("corr calculator i=" + str(i))
            for j in range(i, totalParts):
                logDebug("corr calculator j=" + str(j) + " out of " +
                         str(totalParts))
                A = theMatrixTranspose[i * aPart:(i + 1) * aPart, :].tolist()
                # slicing truncates, so the last part does not take full
                # space if it is not required to do so
                B = theMatrixTranspose[j * aPart:(j + 1) * aPart, :].tolist()
                if settings.isLocalMachine():
                    Result = A[0:1000]  # cheap stand-in to keep local runs fast
                else:
                    Result = pearson_correlation(A, B)
                logInfo('going to concatenate vectorCache with the main list')
                newlist, localHash = self.ExtractTopCorrValues(
                    Result, cacheTopXPerPart, aPart, globalHash, i, j)
                globalHash.update(localHash)
                vectorCache.extend(newlist)
                logInfo('concatenation done...')

        # return the list sorted by correlation, highest first
        logInfo('going to sort... ')
        vectorCache.sort(key=lambda x: x[2], reverse=True)
        logInfo('Calculate Similarity done. ' + str(len(vectorCache)))
        return vectorCache
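The same partition arithmetic can be exercised end to end on a small matrix, with plain numpy standing in for the GPU pearson_correlation. blocked_corr below is an illustrative sketch, not part of the library; it also shows how slicing absorbs a ragged last part.

import numpy

def blocked_corr(theMatrix, aPart):
    # correlate the columns of theMatrix (samples x features) block by block
    n = theMatrix.shape[1]
    totalParts = n // aPart + (1 if n % aPart else 0)  # same round-up as above
    result = numpy.zeros((n, n))
    for i in range(totalParts):
        for j in range(i, totalParts):
            A = theMatrix[:, i * aPart:(i + 1) * aPart]  # slicing truncates, so
            B = theMatrix[:, j * aPart:(j + 1) * aPart]  # the last part is smaller
            wA, wB = A.shape[1], B.shape[1]
            # with rowvar=False, columns are the variables; the cross block of
            # the stacked correlation matrix sits in rows 0:wA, cols wA:wA+wB
            C = numpy.corrcoef(A, B, rowvar=False)[0:wA, wA:wA + wB]
            result[i * aPart:i * aPart + wA, j * aPart:j * aPart + wB] = C
            result[j * aPart:j * aPart + wB, i * aPart:i * aPart + wA] = C.T
    return result

print(blocked_corr(numpy.random.rand(50, 7), 3).shape)  # (7, 7); aPart need not divide 7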
Example #4
class DatasetLoader(object):
    s = Settings()
    __fileBasePath = s.getBasePath()

    def LoadDataSet(self, dataSetType):
        if dataSetType == "A":
            # class attributes are not visible as bare names inside methods;
            # reach them through self (note: __fileBasePath is name-mangled)
            mat = scipy.io.loadmat(self.__fileBasePath +
                                   self.s.getInterimPath() +
                                   self.s.getDatasetAFileName())
            return mat['G0']
        else:
            return [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0],
                    [2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
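The self. qualifiers above matter: the original raised NameError because class attributes are not part of a method's name scope, and a double-underscore name is additionally mangled to _ClassName__name. A small self-contained demonstration (Demo is illustrative):

class Demo(object):
    __secret = 42

    def ok(self):
        return self.__secret   # compiled as self._Demo__secret

print(Demo().ok())             # 42
print(Demo._Demo__secret)      # 42: the mangled attribute name
# Demo.__secret here, outside the class body, raises AttributeError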
Example #5
def LoadEnhancedDataSet(self, dataSetType):
    s = Settings()
    if dataSetType == "A":
        # binary mode is required for pickle; the file will need to be f.pickle
        with open('objs.pickle.backup', 'rb') as f:
            return pickle.load(f)[1]  # this will need to be changed to [0]
    else:
        logWarning("HARD CODED VALUE from DataSetLoaderLib.LoadDataSet()")
        return [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0],
                [2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
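The 'rb' added above is required under Python 3, since pickle payloads are bytes and text mode fails with a decode error. A minimal round-trip; the two-element layout is only assumed from the [1] indexing, and the file name is illustrative:

import pickle

payload = ([[0.0, 1.0]], [[2.5, 3.5]])     # element [1] is what the loader returns
with open('objs.pickle.demo', 'wb') as f:
    pickle.dump(payload, f)
with open('objs.pickle.demo', 'rb') as f:  # plain 'r' would fail on load
    print(pickle.load(f)[1])               # [[2.5, 3.5]]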
Example #7
    def LoadDataSet(self, dataSetType):
        s = Settings()
        if dataSetType == "A":
            variables = numpy.array(
                joblib.load('DatasetA_Validation.joblib.pkl'))
            return variables
            # earlier .mat-based loader, kept for reference:
            # mat = scipy.io.loadmat(s.getBasePath() + s.getInterimPath() + s.getDatasetAFileName())
            # return mat['G0'][:, 0:s.sampleSize()]
        elif dataSetType == "B_train":
            variables = numpy.array(
                joblib.load('DataSetBGSE24417MAQCIITraining_data.joblib.pkl'))
            return variables
        elif dataSetType == "B_test":
            variables = numpy.array(
                joblib.load(
                    'DataSetBGSE24417MAQCIIValidation_data.joblib.pkl'))
            return variables
        else:
            print("INVALID INPUT")
            logWarning("HARD CODED VALUE from DataSetLoaderLib.LoadDataSet()")
            return [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0],
                    [2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
Example #8
from GlobalUtils import *
import numpy
from LoadDataSetA import LoadDataSet
from MachineSpecificSettings import Settings

s = Settings()
p = 20  # number of feature columns
w = 2   # block width
# numpy.matrix((88, p)) built a 1x2 matrix holding the values 88 and p;
# an 88 x p array of zeros was the intent
G = numpy.zeros((88, p), dtype=float)
Result = numpy.zeros((p, p), dtype=float)
logDebug("going to start the loops now")
LoadDataSet()  # note: the loaded data is never assigned to G here
for i in range(0, p // w):  # floor division so range() gets an int under Python 3
    print("i=" + str(i))
    for j in range(i, p // w):
        print("j=" + str(j))
        A = G[:, i * w:(i + 1) * w]
        B = G[:, j * w:(j + 1) * w]
        # corrcoef stacks A and B; with rowvar=False, columns are the variables,
        # so the w x w cross-correlation block is rows 0:w, cols w:2w.
        # (The original computed R1 only when i == j and reused a stale R1 otherwise.)
        R1 = numpy.corrcoef(A, B, rowvar=False)[0:w, w:2 * w]
        Result[i * w:(i + 1) * w, j * w:(j + 1) * w] = R1
        if i != j:
            Result[j * w:(j + 1) * w, i * w:(i + 1) * w] = R1.transpose()
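At p = 20 the whole correlation matrix also fits in a single numpy.corrcoef call, which gives a handy cross-check that the blocked assembly is correct (Gtest is illustrative random data):

import numpy

p, w = 20, 2
Gtest = numpy.random.default_rng(0).random((88, p))
blocked = numpy.zeros((p, p))
for i in range(p // w):
    for j in range(i, p // w):
        C = numpy.corrcoef(Gtest[:, i*w:(i+1)*w], Gtest[:, j*w:(j+1)*w],
                           rowvar=False)[0:w, w:2*w]
        blocked[i*w:(i+1)*w, j*w:(j+1)*w] = C
        blocked[j*w:(j+1)*w, i*w:(i+1)*w] = C.T
print(numpy.allclose(blocked, numpy.corrcoef(Gtest, rowvar=False)))  # True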