Example #1
 def __loadLeafRanks(self):
     utilFileName = PathDefaults.getSourceDir() + "/apgl/metabolomics/R/Util.R"
     leafRanksFileName = PathDefaults.getSourceDir() + "/apgl/metabolomics/R/MSLeafRanks.R"
     robjects.r["source"](utilFileName)
     robjects.r["source"](leafRanksFileName)
Example #2
    def __init__(self, field):
        numpy.random.seed(21)        
        
        dataDir = PathDefaults.getDataDir() + "dblp/"
        self.xmlFileName = dataDir + "dblp.xml"
        self.xmlCleanFilename = dataDir + "dblpClean.xml"        

        resultsDir = PathDefaults.getDataDir() + "reputation/" + field + "/"
        self.expertsFileName = resultsDir + "experts.txt"
        self.expertMatchesFilename = resultsDir + "experts_matches.csv"
        self.trainExpertMatchesFilename = resultsDir + "experts_train_matches.csv"
        self.testExpertMatchesFilename = resultsDir + "experts_test_matches.csv"
        self.coauthorsFilename = resultsDir + "coauthors.csv"
        self.publicationsFilename = resultsDir + "publications.csv"
        
        self.stepSize = 100000
        self.numLines = 33532888
        self.publicationTypes = set(["article" , "inproceedings", "proceedings", "book", "incollection", "phdthesis", "mastersthesis", "www"])
        self.p = 0.5     
        self.matchCutoff = 0.95
        
        
        self.cleanXML()
        self.matchExperts()
        logging.warning("Now you must disambiguate the matched experts if not already done")
Example #3
    def __init__(self, maxIter=None, iterStartTimeStamp=None): 
        outputDir = PathDefaults.getOutputDir() + "recommend/erasm/"

        if not os.path.exists(outputDir): 
            os.mkdir(outputDir)
            
        #iterStartDate is the starting date of the iterator 
        if iterStartTimeStamp is not None:
            self.iterStartTimeStamp = iterStartTimeStamp
        else: 
            self.iterStartTimeStamp = 1286229600
            
        self.timeStep = timedelta(30).total_seconds()             
                
        self.ratingFileName = outputDir + "data.npz"          
        self.userDictFileName = outputDir + "userIdDict.pkl"   
        self.groupDictFileName = outputDir + "groupIdDict.pkl" 
        self.isTrainRatingsFileName = outputDir + "is_train.npz"
    
        self.dataDir = PathDefaults.getDataDir() + "erasm/"
        self.dataFileName = self.dataDir + "groupMembers-29-11-12" 
        
        self.maxIter = maxIter 
        self.trainSplit = 4.0/5 
        
        self.processRatings()
        self.splitDataset()        
        self.loadProcessedData()
Example #4
def processSimpleDataset(name, numRealisations, split, ext=".csv", delimiter=",", usecols=None, skiprows=1, converters=None):
    numpy.random.seed(21)
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
    fileName = dataDir + name + ext
    
    print("Loading data from file " + fileName)
    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "/"

    XY = numpy.loadtxt(fileName, delimiter=delimiter, skiprows=skiprows, usecols=usecols, converters=converters)
    X = XY[:, :-1]
    y = XY[:, -1]
    idx = Sampling.shuffleSplit(numRealisations, X.shape[0], split)
    preprocessSave(X, y, outputDir, idx)
Example #5
    def testGenerateRandomGraph(self):
        egoFileName = PathDefaults.getDataDir() + "infoDiffusion/EgoData.csv"
        alterFileName = PathDefaults.getDataDir()  + "infoDiffusion/AlterData.csv"
        numVertices = 1000
        infoProb = 0.1

        
        p = 0.1
        neighbours = 10
        generator = SmallWorldGenerator(p, neighbours)
        graph = SparseGraph(VertexList(numVertices, 0))
        graph = generator.generate(graph)

        self.svmEgoSimulator.generateRandomGraph(egoFileName, alterFileName, infoProb, graph)
Example #6
    def __init__(self, YList, X, featuresName, ages, args):
        super(MetabolomicsExpRunner, self).__init__(args=args)
        self.X = X
        self.YList = YList  # The list of concentrations
        self.featuresName = featuresName
        self.args = args
        self.ages = ages

        self.maxDepth = 5
        self.numTrees = 10
        self.folds = 3
        self.resultsDir = PathDefaults.getOutputDir() + "metabolomics/"

        self.leafRankGenerators = []
        # self.leafRankGenerators.append((SvcGS.generate(), "SVC"))
        # self.leafRankGenerators.append((LinearSvmGS.generate(), "LinearSVM"))
        self.leafRankGenerators.append((LinearSvmPca.generate(), "LinearSVM-PCA"))

        self.funcLeafRankGenerators = []
        # self.funcLeafRankGenerators.append((LinearSvmFGs.generate, "SVMF"))
        # self.funcLeafRankGenerators.append((DecisionTreeF.generate, "CARTF"))
        self.funcLeafRankGenerators.append((SvcFGs.generate, "SVCF"))

        # Store all the label vectors and their missing values
        YIgf1Inds, YICortisolInds, YTestoInds = MetabolomicsUtils.createIndicatorLabels(YList)
        self.hormoneInds = [YIgf1Inds, YICortisolInds, YTestoInds]
        self.hormoneNames = MetabolomicsUtils.getLabelNames()
Example #7
 def getLsos(self):
     """
     Return a function to display R memory usage
     """
     fileName = PathDefaults.getSourceDir() + "/apgl/metabolomics/R/Util.R"
     robjects.r["source"](fileName)
     return robjects.r['lsos']
Example #8
    def testLoadParams(self):
        try:
            lmbda = 0.01

            alterRegressor = PrimalRidgeRegression(lmbda)
            egoRegressor = PrimalRidgeRegression(lmbda)
            predictor = EgoEdgeLabelPredictor(alterRegressor, egoRegressor)

            params = [0.1, 0.2]
            paramFuncs = [egoRegressor.setLambda, alterRegressor.setLambda]
            fileName = PathDefaults.getTempDir() + "tempParams.pkl"

            predictor.saveParams(params, paramFuncs, fileName)

            params2 = predictor.loadParams(fileName)

            self.assertTrue(
                params2[0][0] == "apgl.predictors.PrimalRidgeRegression")
            self.assertTrue(params2[0][1] == "setLambda")
            self.assertTrue(params2[0][2] == 0.1)

            self.assertTrue(
                params2[1][0] == "apgl.predictors.PrimalRidgeRegression")
            self.assertTrue(params2[1][1] == "setLambda")
            self.assertTrue(params2[1][2] == 0.2)
        except IOError as e:
            logging.warn(e)
Example #9
 def __init__(self):
     self.labelNames = ["Cortisol.val", "Testosterone.val", "IGF1.val"]
     self.dataDir = PathDefaults.getDataDir() +  "metabolomic/"
     self.boundsDict = {}
     self.boundsDict["Cortisol"] = numpy.array([0, 89, 225, 573])
     self.boundsDict["Testosterone"] = numpy.array([0, 3, 9, 13])
     self.boundsDict["IGF1"] = numpy.array([0, 200, 441, 782])
Example #10
    def testEstimate(self):
        #Lets set up a simple model based on normal dist
        abcParams = ABCParameters()

        epsilonArray = numpy.array([0.5, 0.2, 0.1])
        posteriorSampleSize = 20

        #Lets get an empirical estimate of Sprime
        model = NormalModel(abcMetrics)
        model.setMu(theta[0])
        model.setSigma(theta[1])

        Sprime = abcMetrics.summary(model.simulate())
        logging.debug(("Real summary statistic: " + str(Sprime)))

        thetaDir = PathDefaults.getTempDir()

        abcSMC = ABCSMC(epsilonArray, createNormalModel, abcParams, thetaDir)
        abcSMC.maxRuns = 100000
        abcSMC.setPosteriorSampleSize(posteriorSampleSize)
        thetasArray = abcSMC.run()
        thetasArray = numpy.array(thetasArray)

        meanTheta = numpy.mean(thetasArray, 0)
        logging.debug((thetasArray.shape))
        logging.debug(thetasArray)
        logging.debug(meanTheta)

        print(thetasArray.shape[0], posteriorSampleSize)

        #Note only mean needs to be similar
        self.assertTrue(thetasArray.shape[0] >= posteriorSampleSize)
        self.assertEquals(thetasArray.shape[1], 2)
        self.assertTrue(numpy.linalg.norm(theta[0] - meanTheta[0]) < 0.2)
Example #12
    def testEdgeFile(self):
        """
        Figure out the problem with the edge file 
        """
        dataDir = PathDefaults.getDataDir() + "cluster/"
        edgesFilename = dataDir + "Cit-HepTh.txt"

        edges = {}
        file = open(edgesFilename, 'r')
        file.readline()
        file.readline()
        file.readline()
        file.readline()

        vertices = {}

        for line in file:
            (vertex1, sep, vertex2) = line.partition("\t")
            vertex1 = vertex1.strip()
            vertex2 = vertex2.strip()
            edges[(vertex1, vertex2)] = 0
            vertices[vertex1] = 0
            vertices[vertex2] = 0

        #It says there are 352807 edges in paper and 27770 vertices
        self.assertEquals(len(edges), 352807)
        self.assertEquals(len(vertices), 27770)
Example #13
    def testPredict2(self):
        # Test on Gauss2D dataset
        dataDir = PathDefaults.getDataDir()

        fileName = dataDir + "Gauss2D_learn.csv"
        XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
        X = XY[:, 0:2]
        y = XY[:, 2]

        fileName = dataDir + "Gauss2D_test.csv"
        testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
        testX = testXY[:, 0:2]
        testY = testXY[:, 2]

        X = Standardiser().standardiseArray(X)
        testX = Standardiser().standardiseArray(testX)

        maxDepths = range(3, 10)
        trainAucs = numpy.array(
            [0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508, 0.7367508, 0.7367508]
        )
        testAucs = numpy.array([0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400, 0.6874400, 0.6874400])
        i = 0

        # The results are approximately the same, but not exactly
        for maxDepth in maxDepths:
            treeRank = TreeRank(self.leafRanklearner)
            treeRank.setMaxDepth(maxDepth)
            treeRank.learnModel(X, y)
            trainScores = treeRank.predict(X)
            testScores = treeRank.predict(testX)

            self.assertAlmostEquals(Evaluator.auc(trainScores, y), trainAucs[i], 2)
            self.assertAlmostEquals(Evaluator.auc(testScores, testY), testAucs[i], 1)
            i += 1
Example #16
 def flixster(minNnzRows=10, minNnzCols=2, quantile=90): 
     matrixFileName = PathDefaults.getDataDir() + "flixster/Ratings.timed.txt" 
     matrixFile = open(matrixFileName)
     matrixFile.readline()
     userIndexer = IdIndexer("i")
     movieIndexer = IdIndexer("i")
     
     ratings = array.array("f")
     logging.debug("Loading ratings from " + matrixFileName)
     
     for i, line in enumerate(matrixFile):
         if i % 1000000 == 0: 
             logging.debug("Iteration: " + str(i))
         vals = line.split()
         
         userIndexer.append(vals[0])
         movieIndexer.append(vals[1])
         ratings.append(float(vals[2]))
     
     rowInds = userIndexer.getArray()
     colInds = movieIndexer.getArray()
     ratings = numpy.array(ratings)
     
     X = sppy.csarray((len(userIndexer.getIdDict()), len(movieIndexer.getIdDict())), storagetype="row", dtype=numpy.int)
     X.put(numpy.array(ratings>3, numpy.int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
     X.prune()
     
     X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)
     
     logging.debug("Read file: " + matrixFileName)
     logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))
     
     #X = Sampling.sampleUsers(X, 1000)
     
     return X 
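
The core pattern above, binarising ratings greater than 3 into a sparse user-item matrix before pruning, does not depend on sppy; a minimal self-contained sketch of the binarisation step with scipy.sparse, using made-up toy triples:

import numpy
import scipy.sparse

# Hypothetical (user, item, rating) triples, for illustration only.
rowInds = numpy.array([0, 0, 1, 2], numpy.int32)
colInds = numpy.array([0, 1, 1, 2], numpy.int32)
ratings = numpy.array([5.0, 2.0, 4.0, 1.0])

# Binarise: store a 1 wherever the rating is "positive" (> 3).
X = scipy.sparse.csr_matrix(((ratings > 3).astype(numpy.int64), (rowInds, colInds)))
print(X.toarray())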
Example #17
    def epinions(minNnzRows=10, minNnzCols=3, quantile=90): 
        matrixFileName = PathDefaults.getDataDir() + "epinions/rating.mat" 
        A = scipy.io.loadmat(matrixFileName)["rating"]
        
        userIndexer = IdIndexer("i")
        itemIndexer = IdIndexer("i")        
        
        for i in range(A.shape[0]): 
            userIndexer.append(A[i, 0])
            itemIndexer.append(A[i, 1])


        rowInds = userIndexer.getArray()
        colInds = itemIndexer.getArray()
        ratings = A[:, 3]        
        
        X = sppy.csarray((len(userIndexer.getIdDict()), len(itemIndexer.getIdDict())), storagetype="row", dtype=numpy.int)
        X.put(numpy.array(ratings>3, numpy.int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
        X.prune()
        
        X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)
        
        logging.debug("Read file: " + matrixFileName)
        logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

        return X 
Example #18
    def testToyData(self):
        dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
        data = numpy.load(dataDir + "toyData.npz")
        gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]


        pxSum = 0
        pY1XSum = 0
        pYminus1XSum = 0

        px2Sum = 0 
        squareArea = (gridPoints[1]-gridPoints[0])**2

        for i in range(gridPoints.shape[0]-1):
            for j in range(gridPoints.shape[0]-1):
                px = (pdfX[i,j]+pdfX[i+1,j]+pdfX[i, j+1]+pdfX[i+1, j+1])/4
                pxSum += px*squareArea

                pY1X = (pdfY1X[i,j]+pdfY1X[i+1,j]+pdfY1X[i, j+1]+pdfY1X[i+1, j+1])/4
                pY1XSum += pY1X*squareArea

                pYminus1X = (pdfYminus1X[i,j]+pdfYminus1X[i+1,j]+pdfYminus1X[i, j+1]+pdfYminus1X[i+1, j+1])/4
                pYminus1XSum += pYminus1X*squareArea

                px2Sum += px*pY1X*squareArea + px*pYminus1X*squareArea

        self.assertAlmostEquals(pxSum, 1)
        print(pY1XSum)
        print(pYminus1XSum)

        self.assertAlmostEquals(px2Sum, 1)
Example #19
    def testComputeIdealPenalty(self):
        dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
        data = numpy.load(dataDir + "toyData.npz")
        gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

        sampleSize = 100
        trainX, trainY = X[0:sampleSize, :], y[0:sampleSize]
        testX, testY = X[sampleSize:, :], y[sampleSize:]

        #We form a test set from the grid points
        fullX = numpy.zeros((gridPoints.shape[0]**2, 2))
        for m in range(gridPoints.shape[0]):
            fullX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 0] = gridPoints
            fullX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 1] = gridPoints[m]

        C = 1.0
        gamma = 1.0
        args = (trainX, trainY, fullX, C, gamma, gridPoints, pdfX, pdfY1X, pdfYminus1X)
        penalty = computeIdealPenalty(args)


        #Now compute penalty using data
        args = (trainX, trainY, testX, testY, C, gamma)
        penalty2 = computeIdealPenalty2(args)

        self.assertAlmostEquals(penalty2, penalty, 2)
Example #21
    def __init__(self, cmdLine=None, defaultAlgoArgs = None, dirName=""):
        """ priority for default args
         - best priority: command-line value
         - middle priority: set-by-function value
         - lower priority: class value
        """
        # Parameters to choose which methods to run
        # Obtained merging default parameters from the class with those from the user
        self.algoArgs = RankingExpHelper.newAlgoParams(defaultAlgoArgs)


        self.ps = [1, 3, 5]

        #The max number of observations to use for model selection
        self.sampleSize = 5*10**6

        # basic resultsDir
        self.resultsDir = PathDefaults.getOutputDir() + "ranking/" + dirName + "/"

        #Create the results dir if it does not exist
        #    os.makedirs(resultsDir, exist_ok=True) # for python 3.2
        try:
            os.makedirs(self.resultsDir)
        except OSError as err:
            if err.errno != errno.EEXIST:
                raise

        # update algoParams from command line
        self.readAlgoParams(cmdLine)

        #Sometimes there are problems with multiprocessing, so this fixes the issues
        os.system('taskset -p 0xffffffff %d' % os.getpid())
Example #22
 def main(argv=None):
     if argv is None:
         argv = sys.argv
     try:
         # read options
         try:
             opts, args = getopt.getopt(argv[1:], "hd:n:D", ["help", "dir=", "nb_user=", "debug"])
         except getopt.error as msg:
             raise RGUsage(msg)
         # apply options
         dir = PathDefaults.getDataDir() + "cluster/"
         nb_user = None
         log_level = logging.INFO
         for o, a in opts:
             if o in ("-h", "--help"):
                 print(__doc__)
                 return 0
             elif o in ("-d", "--dir"):
                 dir = a
             elif o in ("-n", "--nb_user"):
                 nb_user = int(a)
             elif o in ("-D", "--debug"):
                 log_level = logging.DEBUG
         logging.basicConfig(stream=sys.stdout, level=log_level, format='%(levelname)s (%(asctime)s):%(message)s')
         # process: generate data files
         BemolData.generate_data_file(dir, nb_user)
     except RGUsage as err:
         logging.error(err.msg)
         logging.error("for help use --help")
         return 2
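
For comparison, the same option handling could be sketched with argparse; the flags mirror the getopt version above, while the default directory is a placeholder rather than the real PathDefaults value:

import argparse
import logging
import sys

parser = argparse.ArgumentParser(description="Generate data files")
parser.add_argument("-d", "--dir", default="data/cluster/")  # placeholder default
parser.add_argument("-n", "--nb_user", type=int, default=None)
parser.add_argument("-D", "--debug", action="store_true")
args = parser.parse_args()

logLevel = logging.DEBUG if args.debug else logging.INFO
logging.basicConfig(stream=sys.stdout, level=logLevel, format='%(levelname)s (%(asctime)s):%(message)s')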
Example #23
 def getIterator(): 
     dataDir = PathDefaults.getDataDir() + "cluster/"
     
     nbUser = 10000 # set to 'None' to have all users
     nbPurchasesPerIt = 500 # set to 'None' to take all the purchases per date
     startingIteration = 300
     endingIteration = 600 # set to 'None' to have all iterations
     stepSize = 1    
     
     return itertools.islice(BemolData.getGraphIterator(dataDir, nbUser, nbPurchasesPerIt), startingIteration, endingIteration, stepSize)
Example #24
 def profileClusterFromIterator(self):
     iterator = IncreasingSubgraphListIterator(self.graph, self.subgraphIndicesList)
     dataDir = PathDefaults.getDataDir() + "cluster/"
     #iterator = getBemolGraphIterator(dataDir)
     
     def run(): 
         clusterList, timeList, boundList = self.clusterer.clusterFromIterator(iterator, verbose=True)
         print(timeList.cumsum(0))
         
     ProfileUtils.profile('run()', globals(), locals())
Example #25
 def syntheticDataset2(): 
     """
     Create a simple synthetic dataset using a power law distribution on users and items 
     """
     resultsDir = PathDefaults.getDataDir() + "syntheticRanking/"
     matrixFileName = resultsDir + "dataset1.mtx" 
     
     X = sppy.io.mmread(matrixFileName, storagetype="row")
     
     return X   
Example #26
    def testGetTrainIteratorFunc(self):
        dataFilename = PathDefaults.getDataDir() + "reference/author_document_count" 
        dataset = Static2IdValDataset(dataFilename)

        trainIterator = dataset.getTrainIteratorFunc()()      
        testIterator = dataset.getTestIteratorFunc()()
        
        for trainX in trainIterator: 
            testX = next(testIterator)
            
            print(trainX.shape, trainX.nnz, testX.nnz)
            self.assertEquals(trainX.shape, testX.shape)
Example #27
 def mendeley2(minNnzRows=10, minNnzCols=2, quantile=90, dataset="Document"):
     authorAuthorFileName = PathDefaults.getDataDir() + "reference/author" + dataset + "Matrix.mtx"
     logging.debug("Reading file: " + authorAuthorFileName)
     X = sppy.io.mmread(authorAuthorFileName, storagetype="row")
             
     logging.debug("Raw non-zero elements: " + str(X.nnz) + " shape: " + str(X.shape))
     
     X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)
     
     logging.debug("Read file: " + authorAuthorFileName)
     logging.debug("Non-zero elements: " + str(X.nnz) + " shape: " + str(X.shape))
     
     return X 
Example #28
def processParkinsonsDataset(name, numRealisations):
    numpy.random.seed(21)
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
    fileName = dataDir + name + ".data"
    

    XY = numpy.loadtxt(fileName, delimiter=",", skiprows=1)
    inds = list(set(range(XY.shape[1])) - set([5, 6]))
    X = XY[:, inds]

    y1 = XY[:, 5]
    y2 = XY[:, 6]
    #We don't keep whole collections of patients
    split = 0.5

    idx = Sampling.shuffleSplit(numRealisations, X.shape[0], split)

    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "-motor/"
    preprocessSave(X, y1, outputDir, idx)
    
    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "-total/"
    preprocessSave(X, y2, outputDir, idx)
Example #29
    def profileSvd2(self):
        dataDir = PathDefaults.getDataDir() + "erasm/contacts/"
        trainFilename = dataDir + "contacts_train"

        trainX = scipy.io.mmread(trainFilename)
        trainX = scipy.sparse.csc_matrix(trainX, dtype=numpy.int8)

        k = 500
        U, s, V = RandomisedSVD.svd(trainX, k)

        print(s)

        print("All done")
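
RandomisedSVD.svd above is a library-specific routine; as a rough baseline, scipy's built-in truncated sparse SVD can be profiled the same way. A small self-contained sketch on random data (not the contacts matrix):

import numpy
import scipy.sparse
import scipy.sparse.linalg

numpy.random.seed(21)
A = scipy.sparse.random(1000, 500, density=0.01, format="csc")
U, s, Vt = scipy.sparse.linalg.svds(A, k=20)
print(numpy.sort(s)[::-1])  # svds returns singular values in ascending order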
Example #32
    def testBayesError(self):
        dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
        data = numpy.load(dataDir + "toyData.npz")
        gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

        sampleSize = 100
        trainX, trainY = X[0:sampleSize, :], y[0:sampleSize]
        testX, testY = X[sampleSize:, :], y[sampleSize:]

        #We form a test set from the grid points
        gridX = numpy.zeros((gridPoints.shape[0]**2, 2))
        for m in range(gridPoints.shape[0]):
            gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 0] = gridPoints
            gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 1] = gridPoints[m]

        Cs = 2**numpy.arange(-5, 5, dtype=numpy.float)
        gammas = 2**numpy.arange(-5, 5, dtype=numpy.float)

        bestError = 1 

        for C in Cs:
            for gamma in gammas:
                svm = LibSVM(kernel="gaussian", C=C, kernelParam=gamma)
                svm.learnModel(trainX, trainY)
                predY, decisionsY = svm.predict(gridX, True)
                decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")
                error = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)

                predY, decisionsY = svm.predict(testX, True)
                error2 = Evaluator.binaryError(testY, predY)
                print(error, error2)

                if error < bestError:
                    bestError = error
                    bestC = C
                    bestGamma = gamma

        svm = LibSVM(kernel="gaussian", C=bestC, kernelParam=bestGamma)
        svm.learnModel(trainX, trainY)
        predY, decisionsY = svm.predict(gridX, True)
        decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")

        plt.figure(0)
        plt.contourf(gridPoints, gridPoints, decisionGrid, 100)
        plt.colorbar()

        plt.figure(1)
        plt.scatter(X[y==1, 0], X[y==1, 1], c='r', label="+1")
        plt.scatter(X[y==-1, 0], X[y==-1, 1], c='b', label="-1")
        plt.legend()
        plt.show()
Example #33
    def profilePropackSvd(self):
        dataDir = PathDefaults.getDataDir() + "erasm/contacts/"
        trainFilename = dataDir + "contacts_train"

        trainX = scipy.io.mmread(trainFilename)
        trainX = scipy.sparse.csc_matrix(trainX, dtype=numpy.int8)

        k = 500
        U, s, V = SparseUtils.svdPropack(trainX, k, kmax=k * 5)

        print(s)

        #Memory consumption is dependent on kmax
        print("All done")
Example #34
def loadParams(ind): 
    if processReal: 
        resultsDir = PathDefaults.getOutputDir() + "viroscopy/real/theta" + str(ind) + "/"
        outputDir = resultsDir + "stats/"
        
        N, matchAlpha, breakScale, numEpsilons, epsilon, minEpsilon, matchAlg, abcMaxRuns, batchSize, pertScale = HIVModelUtils.realABCParams(True)
        startDate, endDate, recordStep, M, targetGraph, numInds = HIVModelUtils.realSimulationParams(test=True, ind=ind)
        realTheta, sigmaTheta, pertTheta = HIVModelUtils.estimatedRealTheta(ind)
        numInds=2
        prefix = "Real"
    else: 
        resultsDir = PathDefaults.getOutputDir() + "viroscopy/toy/theta/"
        outputDir = resultsDir + "stats/"        
        
        N, matchAlpha, breakScale, numEpsilons, epsilon, minEpsilon, matchAlg, abcMaxRuns, batchSize, pertScale = HIVModelUtils.toyABCParams()
        startDate, endDate, recordStep, M, targetGraph = HIVModelUtils.toySimulationParams(test=True)
        realTheta, sigmaTheta, pertTheta = HIVModelUtils.toyTheta()
        prefix = "Toy"
        numInds = 1

    breakSize = (targetGraph.subgraph(targetGraph.removedIndsAt(endDate)).size - targetGraph.subgraph(targetGraph.removedIndsAt(startDate)).size)  * breakScale       
        
    return N, resultsDir, outputDir, recordStep, startDate, endDate, prefix, targetGraph, breakSize, numEpsilons, M, matchAlpha, matchAlg, numInds
Example #36
    def testPredict2(self):
        #Test on Gauss2D dataset
        dataDir = PathDefaults.getDataDir()

        fileName = dataDir + "Gauss2D_learn.csv"
        XY = numpy.loadtxt(fileName,
                           skiprows=1,
                           usecols=(1, 2, 3),
                           delimiter=",")
        X = XY[:, 0:2]
        y = XY[:, 2] * 2 - 1

        fileName = dataDir + "Gauss2D_test.csv"
        testXY = numpy.loadtxt(fileName,
                               skiprows=1,
                               usecols=(1, 2, 3),
                               delimiter=",")
        testX = testXY[:, 0:2]
        testY = testXY[:, 2] * 2 - 1

        #X = Standardiser().standardiseArray(X)
        #testX = Standardiser().standardiseArray(testX)

        maxDepths = range(3, 10)
        trainAucs = numpy.array([
            0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508,
            0.7367508, 0.7367508
        ])
        testAucs = numpy.array([
            0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400,
            0.6874400, 0.6874400
        ])
        i = 0

        #The results are approximately the same, but not exactly
        for maxDepth in maxDepths:
            treeRank = TreeRank(DecisionTree)
            treeRank.setMaxDepth(maxDepth)
            treeRank.learnModel(X, y)
            trainScores = treeRank.predict(X)
            testScores = treeRank.predict(testX)

            #print(Evaluator.auc(trainScores, y), Evaluator.auc(testScores, testY))

            #self.assertAlmostEquals(Evaluator.auc(trainScores, y), trainAucs[i], 2)
            #self.assertAlmostEquals(Evaluator.auc(testScores, testY), testAucs[i], 1)
            i += 1

        #Compare tree to that of R version
        tree = treeRank.getTree()
Example #37
    def testSaveParams(self):
        try:
            lmbda = 0.01

            alterRegressor = PrimalRidgeRegression(lmbda)
            egoRegressor = PrimalRidgeRegression(lmbda)
            predictor = EgoEdgeLabelPredictor(alterRegressor, egoRegressor)

            params = [0.1, 0.2]
            paramFuncs = [egoRegressor.setLambda, alterRegressor.setLambda]
            fileName = PathDefaults.getTempDir() + "tempParams.pkl"

            predictor.saveParams(params, paramFuncs, fileName)
        except IOError as e:
            logging.warn(e)
Example #38
    def profileArpackSvd(self):
        dataDir = PathDefaults.getDataDir() + "erasm/contacts/"
        trainFilename = dataDir + "contacts_train"

        trainX = scipy.io.mmread(trainFilename)
        trainX = scipy.sparse.csc_matrix(trainX, dtype=numpy.float32)
        print(trainX.dtype.char, trainX.dtype)

        k = 500
        U, s, V = SparseUtils.svdArpack(trainX, k, kmax=k * 5)

        print(s)

        # Memory consumption is dependent on kmax and less than PROPACK
        print("All done")
Example #40
    def movieLens(minNnzRows=10, minNnzCols=2, quantile=90): 
        matrixFileName = PathDefaults.getDataDir() + "movielens/ml-100k/u.data" 
        data = numpy.loadtxt(matrixFileName)
        X = sppy.csarray((numpy.max(data[:, 0]), numpy.max(data[:, 1])), storagetype="row", dtype=numpy.int)
        X.put(numpy.array(data[:, 2]>3, numpy.int), numpy.array(data[:, 0]-1, numpy.int32), numpy.array(data[:, 1]-1, numpy.int32), init=True)
        #X = SparseUtilsCython.centerRowsCsarray(X)   
        #X[X.nonzero()] = X.values()>0
        X.prune()
        #maxNnz = numpy.percentile(X.sum(0), quantile)
        #X = SparseUtils.pruneMatrixCols(X, minNnz=minNnzCols, maxNnz=maxNnz)
        X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)
        logging.debug("Read file: " + matrixFileName)
        logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

        return X
Example #42
    def testPredict2(self):
        #Test on Gauss2D dataset
        dataDir = PathDefaults.getDataDir()

        fileName = dataDir + "Gauss2D_learn.csv"
        XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
        X = XY[:, 0:2]
        y = XY[:, 2]
        
        y = y*2 - 1 

        fileName = dataDir + "Gauss2D_test.csv"
        testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
        testX = testXY[:, 0:2]
        testY = testXY[:, 2]
        
        testY = testY*2-1

        X = Standardiser().standardiseArray(X)
        testX = Standardiser().standardiseArray(testX)

        numTrees = 5
        minSplit = 50 
        maxDepths = range(3, 10)
        trainAucs = numpy.array([0.7252582, 0.7323278, 0.7350289, 0.7372529, 0.7399985, 0.7382176, 0.7395104, 0.7386347])
        testAucs = numpy.array([0.6806122, 0.6851614, 0.6886183, 0.6904147, 0.6897266, 0.6874600, 0.6875980, 0.6878801])

        i = 0
        
        #The results are approximately the same, but not exactly 
        for maxDepth in maxDepths:
            treeRankForest = TreeRankForest(self.leafRanklearner)
            treeRankForest.setMaxDepth(maxDepth)
            treeRankForest.setMinSplit(minSplit)
            treeRankForest.setNumTrees(numTrees)
            treeRankForest.learnModel(X, y)
            trainScores = treeRankForest.predict(X)
            testScores = treeRankForest.predict(testX)

            print(Evaluator.auc(trainScores, y), Evaluator.auc(testScores, testY))

            self.assertAlmostEquals(Evaluator.auc(trainScores, y), trainAucs[i], 1)
            self.assertAlmostEquals(Evaluator.auc(testScores, testY), testAucs[i], 1)
            i+=1
Example #43
    def profile(command, globalVars, localVars, numStats=30):
        """
        Just profile the given command with the global and local variables
        and print out the cumulative and function times. 
        """
        try:
            import pstats
            import cProfile
        except ImportError:
            raise ImportError("profile() requires pstats and cProfile")

        tempDirectory = PathDefaults.getTempDir()
        profileFileName = tempDirectory + "profile.cprof"

        logging.info("Starting to profile ...")
        cProfile.runctx(command, globalVars, localVars, profileFileName)
        logging.info("Done")
        stats = pstats.Stats(profileFileName)
        stats.strip_dirs().sort_stats("cumulative").print_stats(numStats)
        stats.strip_dirs().sort_stats("time").print_stats(numStats)
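
A minimal usage sketch for the method above; slowSum is invented for illustration, and ProfileUtils is assumed to expose profile as a static method, as the call sites in the other examples suggest:

def slowSum(n):
    # Deliberately wasteful so the profiler has something to report.
    return sum(i**2 for i in range(n))

ProfileUtils.profile('slowSum(10**6)', globals(), locals())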
Example #44
    def testPredict2(self):
        #We play around with parameters to maximise AUC on the IGF1_0-Haar data
        dataDir = PathDefaults.getDataDir()
        fileName = dataDir + "IGF1_0-Haar.npy"

        XY = numpy.load(fileName)
        X = XY[:, 0:XY.shape[1]-1]
        y = XY[:, XY.shape[1]-1].ravel()

        weight = numpy.bincount(numpy.array(y, numpy.int))[0]/float(y.shape[0])
        #weight = 0.5
        #weight = 0.9

        folds = 3
        decisionTree = DecisionTree()
        decisionTree.setWeight(weight)
        decisionTree.setMaxDepth(50)
        #decisionTree.setMinSplit(100)
        mean, var = decisionTree.evaluateCv(X, y, folds, Evaluator.auc)
        logging.debug("AUC = " + str(mean))
        logging.debug("Var = " + str(var))
Example #45
File: CLiMF.py  Project: rezaarmand/sandbox
def main():
    import sys
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    matrixFileName = PathDefaults.getDataDir() + "movielens/ml-100k/u.data"
    data = numpy.loadtxt(matrixFileName)
    X = sppy.csarray((numpy.max(data[:, 0]), numpy.max(data[:, 1])),
                     storagetype="row")
    X[data[:, 0] - 1, data[:, 1] - 1] = numpy.array(data[:, 2] > 3, numpy.int)
    logging.debug("Read file: " + matrixFileName)
    logging.debug("Shape of data: " + str(X.shape))
    logging.debug("Number of non zeros " + str(X.nnz))

    u = 0.1
    w = 1 - u
    (m, n) = X.shape

    validationSize = 5
    trainTestXs = Sampling.shuffleSplitRows(X, 1, validationSize)
    trainX, testX = trainTestXs[0]
    trainX = trainX.toScipyCsr()

    learner = CLiMF(k=20, lmbda=0.001, gamma=0.0001)
    learner.learnModel(trainX)
Example #46
    def run(self):
        logging.debug('module name:' + __name__)
        logging.debug('parent process:' + str(os.getppid()))
        logging.debug('process id:' + str(os.getpid()))

        self.saveResults(self.leafRankGenerators, True)

    def run2(self):
        logging.debug('module name:' + __name__)
        logging.debug('parent process:' +  str(os.getppid()))
        logging.debug('process id:' +  str(os.getpid()))

        self.saveResults(self.funcLeafRankGenerators, False)

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.debug("Running from machine " + str(gethostname()))
numpy.random.seed(21)

dataDir = PathDefaults.getDataDir() +  "metabolomic/"
X, X2, Xs, XOpls, YList, ages, df = MetabolomicsUtils.loadData()

waveletStr = 'db4'
mode = "cpd"
level = 10
XwDb4 = MetabolomicsUtils.getWaveletFeatures(X, 'db4', level, mode)
XwDb8 = MetabolomicsUtils.getWaveletFeatures(X, 'db8', level, mode)
XwHaar = MetabolomicsUtils.getWaveletFeatures(X, 'haar', level, mode)

dataList = []
dataList.extend([(XwDb4, "db4")])

lock = multiprocessing.Lock()

numpy.random.seed(datetime.datetime.now().microsecond)
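
MetabolomicsUtils.getWaveletFeatures is not shown here, but concatenating multi-level discrete wavelet coefficients is the standard way to build such features with PyWavelets; the sketch below illustrates that idea on a random signal and is an assumption about the approach, not a copy of the implementation:

import numpy
import pywt

signal = numpy.random.randn(256)
coeffs = pywt.wavedec(signal, 'db4', level=5)  # list of coefficient arrays per level
features = numpy.concatenate(coeffs)           # flatten into one feature vector
print(features.shape)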