Пример #1
0
    def __init__(self, YList, X, featuresName, ages, args):
        """
        Store the data, fix the experiment settings and register the leaf
        rank generators used by the experiment.

        :param YList: the list of concentrations; also passed to
            MetabolomicsUtils.createIndicatorLabels to build the indicators
        :param X: stored on the instance as self.X
        :param featuresName: stored on the instance as self.featuresName
        :param ages: stored on the instance as self.ages
        :param args: forwarded to the parent constructor and stored as self.args
        """
        super(MetabolomicsExpRunner, self).__init__(args=args)

        # Data handed in by the caller
        self.X = X
        self.YList = YList
        self.ages = ages
        self.featuresName = featuresName
        self.args = args

        # Fixed experiment settings
        self.maxDepth = 10
        self.numTrees = 10
        self.sampleSize = 1.0
        self.sampleReplace = True
        self.folds = 5
        self.resultsDir = PathDefaults.getOutputDir() + "metabolomics/"

        # Generators that are invoked here, paired with a display name
        self.leafRankGenerators = [
            (LinearSvmGS.generate(), "SVM"),
            (SvcGS.generate(), "RBF-SVM"),
            (DecisionTree.generate(), "CART"),
        ]

        self.pcaLeafRankGenerators = [(LinearSvmPca.generate(), "LinearSVM-PCA")]

        # Here the generate callables themselves are stored, not their results
        self.funcLeafRankGenerators = [
            (LinearSvmFGs.generate, "SVMF"),
            (SvcFGs.generate, "RBF-SVMF"),
            (DecisionTreeF.generate, "CARTF"),
        ]

        # Indicator labels per hormone, in the order IGF1, cortisol, testosterone
        igf1Inds, cortisolInds, testoInds = MetabolomicsUtils.createIndicatorLabels(YList)
        self.hormoneInds = [igf1Inds, cortisolInds, testoInds]
        self.hormoneNames = MetabolomicsUtils.getLabelNames()
Пример #2
0
    def computeRankMetrics(self, X, Y, indexList, bestLearners, standardiserY, labelIndex):
        """
        Train one learner per fold and compute an AUC for each label interval.

        For the k-th (train, test) index pair of indexList, bestLearners[k] is
        fitted on the training rows and predicts the test rows. Predictions and
        test labels are passed through standardiserY.unstandardiseArray, then
        turned into scores and indicator labels with self.boundsList[labelIndex].

        :return: an array with one row per fold and one column per interval,
            i.e. shape (len(indexList), number of bounds - 1)
        """
        bounds = self.boundsList[labelIndex]
        numIntervals = bounds.shape[0] - 1
        rankMetrics = numpy.zeros((len(indexList), numIntervals))

        for fold, (trainInds, testInds) in enumerate(indexList):
            logging.info("Iteration " + str(fold))

            learner = bestLearners[fold]
            learner.learnModel(X[trainInds, :], Y[trainInds])
            predY = learner.predict(X[testInds, :])
            gc.collect()

            # Undo the standardisation of the labels before scoring
            predY = standardiserY.unstandardiseArray(predY)
            testY = standardiserY.unstandardiseArray(Y[testInds])

            scores = MetabolomicsUtils.scoreLabels(predY, bounds)
            indicators = MetabolomicsUtils.createIndicatorLabel(testY, bounds)

            for j in range(numIntervals):
                rankMetrics[fold, j] = Evaluator.auc(scores[:, j], indicators[j])

        logging.debug(rankMetrics)

        return rankMetrics
Пример #3
0
    def __init__(self, YList, X, featuresName, ages, args):
        """
        Store the data, fix the experiment settings and register the leaf
        rank generators used by the experiment.

        :param YList: the list of concentrations; also passed to
            MetabolomicsUtils.createIndicatorLabels to build the indicators
        :param X: stored on the instance as self.X
        :param featuresName: stored on the instance as self.featuresName
        :param ages: stored on the instance as self.ages
        :param args: forwarded to the parent constructor and stored as self.args
        """
        super(MetabolomicsExpRunner, self).__init__(args=args)

        # Data handed in by the caller
        self.X = X
        self.YList = YList
        self.ages = ages
        self.featuresName = featuresName
        self.args = args

        # Fixed experiment settings
        self.maxDepth = 5
        self.numTrees = 10
        self.folds = 3
        self.resultsDir = PathDefaults.getOutputDir() + "metabolomics/"

        # Only the PCA-based linear SVM is registered; the SvcGS ("SVC") and
        # LinearSvmGS ("LinearSVM") entries are switched off
        self.leafRankGenerators = [(LinearSvmPca.generate(), "LinearSVM-PCA")]

        # Only SvcFGs is registered (the callable itself, not its result); the
        # LinearSvmFGs ("SVMF") and DecisionTreeF ("CARTF") entries are switched off
        self.funcLeafRankGenerators = [(SvcFGs.generate, "SVCF")]

        # Indicator labels per hormone, in the order IGF1, cortisol, testosterone
        igf1Inds, cortisolInds, testoInds = MetabolomicsUtils.createIndicatorLabels(YList)
        self.hormoneInds = [igf1Inds, cortisolInds, testoInds]
        self.hormoneNames = MetabolomicsUtils.getLabelNames()
Пример #4
0
    def __init__(self, df, X, featuresName, ages, args):
        """
        Store the data and derive the label list and bounds for the
        regression experiment.

        :param df: the data frame; the label list is built from it with
            MetabolomicsUtils.createLabelList
        :param X: stored on the instance as self.X
        :param featuresName: stored on the instance as self.featuresName
        :param ages: stored on the instance as self.ages
        :param args: forwarded to the parent constructor and stored as self.args
        """
        super(MetabolomicsRegExpRunner, self).__init__(args=args)

        # Data handed in by the caller
        self.X = X
        self.df = df
        self.ages = ages
        self.featuresName = featuresName
        self.args = args

        # Labels are looked up in df by the names MetabolomicsUtils provides
        labelNames = MetabolomicsUtils.getLabelNames()
        self.labelNames = labelNames
        self.YList = MetabolomicsUtils.createLabelList(df, labelNames)
        self.boundsList = MetabolomicsUtils.getBounds()

        self.resultsDir = PathDefaults.getOutputDir() + "metabolomics/"
Пример #5
0
    def meanAUC(self, predY, testY, labelIndex, standardiserY):
        """
        Return the mean of the per-interval AUCs for one set of predictions.

        Both label vectors are passed through standardiserY.unstandardiseArray,
        then converted to scores and indicator labels with
        self.boundsList[labelIndex]. One AUC is computed per interval
        (number of bounds - 1) and the results are averaged.
        """
        bounds = self.boundsList[labelIndex]
        numIntervals = bounds.shape[0] - 1

        scores = MetabolomicsUtils.scoreLabels(
            standardiserY.unstandardiseArray(predY), bounds)
        indicators = MetabolomicsUtils.createIndicatorLabel(
            standardiserY.unstandardiseArray(testY), bounds)

        aucs = numpy.array(
            [Evaluator.auc(scores[:, j], indicators[j]) for j in range(numIntervals)],
            dtype=float)

        return numpy.mean(aucs)
Пример #6
0
    def testReconstructSignal(self):
        """
        Check that MetabolomicsUtils.reconstructSignal recovers a random
        matrix from its wavelet features to within a small tolerance.
        """
        numExamples, numFeatures = 100, 16
        X = numpy.random.rand(numExamples, numFeatures)

        waveletStr = "db4"
        mode = "cpd"
        level = 10
        # Decomposition of the first row; it is passed to reconstructSignal below
        C = pywt.wavedec(X[0, :], waveletStr, mode, level=level)

        Xw = MetabolomicsUtils.getWaveletFeatures(X, waveletStr, level, mode)
        X2 = MetabolomicsUtils.reconstructSignal(X, Xw, waveletStr, mode, C)

        tol = 10**-6
        reconstructionError = numpy.linalg.norm(X - X2)
        self.assertTrue(reconstructionError < tol)
Пример #7
0
    def testCreateIndicatorLabels(self):
        """
        Check that, for each hormone, the three indicator vectors returned by
        MetabolomicsUtils.createIndicatorLabels sum to one for every example.

        The differences to the indicator columns stored in the data frame are
        only logged at debug level; they are not asserted.
        """
        numpy.set_printoptions(threshold=3000)
        X, X2, Xs, Xopls, YList, df = MetabolomicsUtils.loadData()

        # Each YList entry is a (labels, indices) pair; only the indices are used
        (_, inds1), (_, inds2), (_, inds3) = YList[0], YList[1], YList[2]

        YIgf1Inds, YICortisolInds, YTestoInds = MetabolomicsUtils.createIndicatorLabels(YList)

        # The three indicators of a hormone must add up to one everywhere
        for hormoneInds in (YIgf1Inds, YICortisolInds, YTestoInds):
            total = hormoneInds[0] + hormoneInds[1] + hormoneInds[2]
            self.assertTrue((total == numpy.ones(total.shape[0])).all())

        # Now compare to those labels in the file
        comparisons = (
            (YIgf1Inds, inds1, ["Ind.IGF1.1", "Ind.IGF1.2", "Ind.IGF1.3"]),
            (YICortisolInds, inds2, ["Ind.Cortisol.1", "Ind.Cortisol.2", "Ind.Cortisol.3"]),
            (YTestoInds, inds3, ["Ind.Testo.1", "Ind.Testo.2", "Ind.Testo.3"]),
        )

        for computed, inds, columnNames in comparisons:
            for k, columnName in enumerate(columnNames):
                fromFile = numpy.array(df.rx(columnName)).ravel()[inds]
                logging.debug(numpy.sum(numpy.abs(computed[k] - fromFile)))
Пример #8
0
    def testGetWaveletFeaturesTest(self):
        """
        Check that the loaded data can be reproduced from the wavelet features
        returned by MetabolomicsUtils.getWaveletFeatures.

        First the pywt decompose/reconstruct round trip is verified on the
        first row alone. Then, for the 'db4', 'db8' and 'haar' wavelets, every
        row of the feature matrix is split back into coefficient arrays and
        reconstructed, and the result is compared with X.
        """
        X, X2, Xs, Xopls, YList, df = MetabolomicsUtils.loadData()

        mode = "zpd"
        level = 10
        tol = 10**-6

        # Round trip of a single row with pywt only
        firstRow = X[0, :]
        coeffs = pywt.wavedec(firstRow, 'db4', level=level, mode=mode)
        firstRowRec = pywt.waverec(coeffs, 'db4', mode)
        self.assertTrue(numpy.linalg.norm(firstRowRec - firstRow) < tol)

        def rebuildRows(Xw, waveletStr, template):
            # Cut each feature row into pieces with the lengths of the
            # template coefficient arrays and invert the transform
            lengths = [len(c) for c in template]
            rebuilt = numpy.zeros(X.shape)

            for i in range(Xw.shape[0]):
                pieces = []
                start = 0
                for length in lengths:
                    pieces.append(Xw[i, start:start + length])
                    start += length

                rebuilt[i, :] = pywt.waverec(tuple(pieces), waveletStr, mode)

            return rebuilt

        # Now do the same for the whole X, once per wavelet
        for waveletStr in ('db4', 'db8', 'haar'):
            template = pywt.wavedec(X[0, :], waveletStr, level=level, mode=mode)
            Xw = MetabolomicsUtils.getWaveletFeatures(X, waveletStr, level, mode)
            Xrecstr = rebuildRows(Xw, waveletStr, template)
            self.assertTrue(numpy.linalg.norm(X - Xrecstr) < tol)
Пример #9
0
    def testFilterWavelet(self):
        """
        Check MetabolomicsUtils.filterWavelet on the wavelet features of a
        random matrix: it must return N indices, leave the columns at those
        indices unchanged and leave (numerically) zero in all other columns.
        """
        numExamples = 100
        numFeatures = 16
        X = numpy.random.rand(numExamples, numFeatures)

        level = 10
        mode = "cpd"
        waveletStr = "db4"
        Xw = MetabolomicsUtils.getWaveletFeatures(X, waveletStr, level, mode)

        N = 10
        Xw2, inds = MetabolomicsUtils.filterWavelet(Xw, N)

        tol = 10**-6
        # assertEqual, not the deprecated assertEquals alias (removed in Python 3.12)
        self.assertEqual(inds.shape[0], N)
        self.assertTrue(numpy.linalg.norm(Xw[:, inds] - Xw2[:, inds]) < tol)

        # Every column that was not selected must have been zeroed
        zeroInds = numpy.setdiff1d(numpy.arange(Xw.shape[1]), inds)
        self.assertTrue(numpy.linalg.norm(Xw2[:, zeroInds]) < tol)
Пример #10
0
    def testScoreLabel(self):
        """
        Check the orderings produced by MetabolomicsUtils.scoreLabels for the
        bounds [0, 0.2, 0.8, 1.0].

        The last score column must sort the examples like Y, the first column
        in the reverse order of Y, and the middle column in the reverse order
        of |Y - 0.5|. A constant Y must give scores that are all one.
        """
        numExamples = 10
        bounds = numpy.array([0, 0.2, 0.8, 1.0])
        Y = numpy.random.rand(numExamples)

        YScores = MetabolomicsUtils.scoreLabels(Y, bounds)

        ascending = numpy.argsort(Y)
        byMidDistance = numpy.argsort(numpy.abs(Y - 0.5))

        lowOrder = numpy.argsort(YScores[:, 0])
        midOrder = numpy.argsort(YScores[:, 1])
        highOrder = numpy.argsort(YScores[:, -1])

        self.assertTrue((ascending == highOrder).all())
        self.assertTrue((ascending == numpy.flipud(lowOrder)).all())
        self.assertTrue((byMidDistance == numpy.flipud(midOrder)).all())

        # Test we don't get problems when Y has the same values
        constantY = numpy.ones(numExamples)
        constantScores = MetabolomicsUtils.scoreLabels(constantY, bounds)

        expected = numpy.ones((constantY.shape[0], 3))
        self.assertTrue((constantScores == expected).all())
Пример #11
0
        self.saveResults(self.leafRankGenerators, True)

    def run2(self):
        """
        Log the module name and the parent/own process ids at debug level,
        then save results for self.funcLeafRankGenerators with the flag False.
        """
        processInfo = (
            ('module name:', __name__),
            ('parent process:', str(os.getppid())),
            ('process id:', str(os.getpid())),
        )
        for label, value in processInfo:
            logging.debug(label + value)

        self.saveResults(self.funcLeafRankGenerators, False)

# Log everything from DEBUG upwards to stdout and note the host name
logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
logging.debug("Running from machine " + str(gethostname()))
# Seed numpy's generator with a fixed value before the data is loaded
numpy.random.seed(21)

dataDir = PathDefaults.getDataDir() + "metabolomic/"
X, X2, Xs, XOpls, YList, ages, df = MetabolomicsUtils.loadData()

waveletStr = 'db4'
mode = "cpd"
level = 10
# Wavelet features of X for each wavelet family, computed in this order
XwDb4, XwDb8, XwHaar = (
    MetabolomicsUtils.getWaveletFeatures(X, wavelet, level, mode)
    for wavelet in ('db4', 'db8', 'haar')
)

# Only the db4 features are put into the list of datasets
dataList = [(XwDb4, "db4")]

lock = multiprocessing.Lock()

# Re-seed from the clock; call numpy.random.seed(21) here instead for a fixed seed
numpy.random.seed(datetime.datetime.now().microsecond)
Пример #12
0
        self.saveResults(self.funcLeafRankGenerators, "func")

    def runPCA(self):
        """
        Log the module name and the parent/own process ids at debug level,
        then save results for self.pcaLeafRankGenerators under the tag "pca".
        """
        processInfo = (
            ('module name:', __name__),
            ('parent process:', str(os.getppid())),
            ('process id:', str(os.getpid())),
        )
        for label, value in processInfo:
            logging.debug(label + value)

        self.saveResults(self.pcaLeafRankGenerators, "pca")

# Log everything from DEBUG upwards to stdout and note the host name
logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
logging.debug("Running from machine " + str(gethostname()))
# Seed numpy's generator with a fixed value before the data is loaded
numpy.random.seed(21)

dataDir = PathDefaults.getDataDir() + "metabolomic/"
X, X2, Xs, XOpls, YList, ages, df = MetabolomicsUtils.loadData()

mode = "cpd"
level = 10
# Wavelet features of X for each wavelet family, computed in this order
XwDb4, XwDb8, XwHaar = (
    MetabolomicsUtils.getWaveletFeatures(X, wavelet, level, mode)
    for wavelet in ('db4', 'db8', 'haar')
)

# Filter the wavelets: one dataset per N, holding only the columns that
# filterWavelet returned as indices
Ns = [10, 25, 50, 75, 100]
dataList = []

for i, N in enumerate(Ns):
    XwDb4F, inds = MetabolomicsUtils.filterWavelet(XwDb4, N)
    dataList.append((XwDb4F[:, inds], "Db4-" + str(N)))
Пример #13
0
from rpy2.robjects.packages import importr
from socket import gethostname
import matplotlib.pyplot as plt
from apgl.data.Standardiser import Standardiser

# Log everything from DEBUG upwards to stdout and note the host name
logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
logging.debug("Running from machine " + str(gethostname()))
numpy.random.seed(21)
numpy.set_printoptions(suppress=True, precision=3, linewidth=160)

# R packages used through rpy2; the R option "warn" is set to 1
treeRankLib = importr('TreeRank')
baseLib = importr('base')
baseLib.options(warn=1)

dataDir = PathDefaults.getDataDir() + "metabolomic/"
X, X2, Xs, XOpls, YList, ages, df = MetabolomicsUtils.loadData()

YIgf1Inds, YICortisolInds, YTestoInds = MetabolomicsUtils.createIndicatorLabels(YList)

mode = "cpd"
level = 10
# Wavelet features of X for each wavelet family, computed in this order
XwDb4, XwDb8, XwHaar = (
    MetabolomicsUtils.getWaveletFeatures(X, wavelet, level, mode)
    for wavelet in ('db4', 'db8', 'haar')
)

# Plot the correlation of the raw spectrum above x percent.
# A uniform random matrix with the dimensions of Xs is listed next to the real data.
Xr = numpy.random.rand(Xs.shape[0], Xs.shape[1])
datasets = [
    (Xr, "random"),
    (Xs, "raw"),
    (XwHaar, "haar"),
    (XwDb4, "db4"),
    (XwDb8, "db8"),
]

# Correlation thresholds 0, 0.01, ..., 1
corLims = numpy.arange(0, 1.01, 0.01)
Пример #14
0
from apgl.util.PathDefaults import PathDefaults
from exp.metabolomics.MetabolomicsUtils import MetabolomicsUtils
import numpy
import pywt 


# Load the synthetic control data set from the "functional" data directory
dataDir = PathDefaults.getDataDir() + "functional/"
fileName = dataDir + "synthetic_control.data"

X = numpy.loadtxt(fileName)

# Ignore first 200 examples
X = X[200:, :]

# Of the remaining rows the first 200 get label -1, all later rows label +1
Y = numpy.ones(X.shape[0])  # Upward shift and downward shift
Y[:200] = -1  # Increasing trend and decreasing trend

# Compute wavelets
waveletStr = "db2"
level = 2
mode = "cpd"
Xw = MetabolomicsUtils.getWaveletFeatures(X, waveletStr, level, mode)

print(X.shape)
print(Xw.shape)

# Print the shape of each coefficient array of the first row's decomposition
C = pywt.wavedec(X[0, :], waveletStr, mode=mode, level=level)

for c in C:
    print(c.shape)
Пример #15
0
        self.saveResults(self.leafRankGenerators, True)

    def run2(self):
        """
        Log the module name and the parent/own process ids at debug level,
        then save results for self.funcLeafRankGenerators with the flag False.
        """
        processInfo = (
            ("module name:", __name__),
            ("parent process:", str(os.getppid())),
            ("process id:", str(os.getpid())),
        )
        for label, value in processInfo:
            logging.debug(label + value)

        self.saveResults(self.funcLeafRankGenerators, False)


# Log everything from DEBUG upwards to stdout and note the host name
logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
logging.debug("Running from machine " + str(gethostname()))
# Seed numpy's generator with a fixed value before the data is loaded
numpy.random.seed(21)

dataDir = PathDefaults.getDataDir() + "metabolomic/"
X, X2, Xs, XOpls, YList, ages, df = MetabolomicsUtils.loadData()

waveletStr = "db4"
mode = "cpd"
level = 10
# Wavelet features of X for each wavelet family, computed in this order
XwDb4, XwDb8, XwHaar = (
    MetabolomicsUtils.getWaveletFeatures(X, wavelet, level, mode)
    for wavelet in ("db4", "db8", "haar")
)

# Only the db4 features are put into the list of datasets
dataList = [(XwDb4, "db4")]

lock = multiprocessing.Lock()

# Re-seed from the clock; call numpy.random.seed(21) here instead for a fixed seed
numpy.random.seed(datetime.datetime.now().microsecond)
Пример #16
0
import numpy
from exp.metabolomics.MetabolomicsUtils import MetabolomicsUtils

X, X2, df = MetabolomicsUtils.loadData()

# Just figure out the boundaries of the levels
numpy.set_printoptions(threshold=3000)
labelNames = ["IGF1.val", "Cortisol.val", "Testosterone.val"]
labelNames2 = ["Ind.IGF1.1", "Ind.IGF1.2", "Ind.IGF1.3"]
YList = MetabolomicsUtils.createLabelList(df, labelNames)
YList2 = MetabolomicsUtils.createLabelList(df, labelNames2)

# IGF1: values next to the three indicator columns read from df
Y, inds = YList[0]
Y1, Y2, Y3 = (numpy.array(df.rx(name)).ravel()[inds] for name in labelNames2)

# Print the values in ascending order with their indicators alongside
inds = numpy.argsort(Y)
YY = numpy.c_[Y[inds], Y1[inds], Y2[inds], Y3[inds]]
print(YY)

# Cortisol: the same lookup for the second label
labelNames2 = ["Ind.Cortisol.1", "Ind.Cortisol.2", "Ind.Cortisol.3"]
YList2 = MetabolomicsUtils.createLabelList(df, labelNames2)

Y, inds = YList[1]
Y1, Y2, Y3 = (numpy.array(df.rx(name)).ravel()[inds] for name in labelNames2)
Пример #17
0
from rpy2.robjects.packages import importr
from socket import gethostname
import matplotlib.pyplot as plt
from apgl.data.Standardiser import Standardiser

# Log everything from DEBUG upwards to stdout and note the host name
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.debug("Running from machine " + str(gethostname()))
# Seed numpy's generator with a fixed value and widen array printing
numpy.random.seed(21)
numpy.set_printoptions(linewidth=160, precision=3, suppress=True)

# R packages used through rpy2; the R option "warn" is set to 1
treeRankLib = importr('TreeRank')
baseLib = importr('base')
baseLib.options(warn=1)

dataDir = PathDefaults.getDataDir() +  "metabolomic/"
X, X2, Xs, XOpls, YList, ages, df = MetabolomicsUtils.loadData()

waveletStr = 'db4'
mode = "cpd"
maxLevel = 10
# One slot per level up to maxLevel; not filled within the visible part of the script
errors = numpy.zeros(maxLevel)
numFeatures = numpy.zeros(maxLevel)

level = 10 
waveletStrs = ["haar", "db4", "db8"]

#The variances are very similar across different wavelets 
# NOTE(review): the loop reuses (overwrites) the module-level waveletStr set above
for waveletStr in waveletStrs:
    # Wavelet features of X for this wavelet, then centred with a fresh Standardiser
    Xw = MetabolomicsUtils.getWaveletFeatures(X, waveletStr, level, mode)
    standardiser = Standardiser()
    Xw = standardiser.centreArray(Xw)