def __init__(self, field):
    numpy.random.seed(21)
    dataDir = PathDefaults.getDataDir() + "dblp/"
    self.xmlFileName = dataDir + "dblp.xml"
    self.xmlCleanFilename = dataDir + "dblpClean.xml"

    resultsDir = PathDefaults.getDataDir() + "reputation/" + field + "/"
    self.expertsFileName = resultsDir + "experts.txt"
    self.expertMatchesFilename = resultsDir + "experts_matches.csv"
    self.trainExpertMatchesFilename = resultsDir + "experts_train_matches.csv"
    self.testExpertMatchesFilename = resultsDir + "experts_test_matches.csv"
    self.coauthorsFilename = resultsDir + "coauthors.csv"
    self.publicationsFilename = resultsDir + "publications.csv"

    self.stepSize = 100000
    self.numLines = 33532888
    self.publicationTypes = set(["article", "inproceedings", "proceedings", "book", "incollection", "phdthesis", "mastersthesis", "www"])
    self.p = 0.5
    self.matchCutoff = 0.95

    self.cleanXML()
    self.matchExperts()
    logging.warning("Now you must disambiguate the matched experts if not already done")
def processSimpleDataset(name, numRealisations, split, ext=".csv", delimiter=",", usecols=None, skiprows=1, converters=None):
    numpy.random.seed(21)
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
    fileName = dataDir + name + ext

    print("Loading data from file " + fileName)
    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "/"

    XY = numpy.loadtxt(fileName, delimiter=delimiter, skiprows=skiprows, usecols=usecols, converters=converters)
    X = XY[:, :-1]
    y = XY[:, -1]
    idx = Sampling.shuffleSplit(numRealisations, X.shape[0], split)
    preprocessSave(X, y, outputDir, idx)
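#--- A numpy stand-in for Sampling.shuffleSplit, with semantics inferred from
#--- how it is used above (numRealisations, number of examples, split fraction).
#--- This is an assumption for illustration, not the project's implementation.
import numpy

def shuffleSplitSketch(numRealisations, numExamples, split):
    #One (trainInds, testInds) pair per realisation
    idx = []
    for _ in range(numRealisations):
        perm = numpy.random.permutation(numExamples)
        numTrain = int(numExamples*split)
        idx.append((perm[:numTrain], perm[numTrain:]))
    return idx

idx = shuffleSplitSketch(3, 10, 0.7)
print([(train.shape[0], test.shape[0]) for train, test in idx])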
def testGenerateRandomGraph(self):
    egoFileName = PathDefaults.getDataDir() + "infoDiffusion/EgoData.csv"
    alterFileName = PathDefaults.getDataDir() + "infoDiffusion/AlterData.csv"
    numVertices = 1000
    infoProb = 0.1

    p = 0.1
    neighbours = 10
    generator = SmallWorldGenerator(p, neighbours)
    graph = SparseGraph(VertexList(numVertices, 0))
    graph = generator.generate(graph)

    self.svmEgoSimulator.generateRandomGraph(egoFileName, alterFileName, infoProb, graph)
def flixster(minNnzRows=10, minNnzCols=2, quantile=90):
    matrixFileName = PathDefaults.getDataDir() + "flixster/Ratings.timed.txt"
    matrixFile = open(matrixFileName)
    matrixFile.readline()
    userIndexer = IdIndexer("i")
    movieIndexer = IdIndexer("i")
    ratings = array.array("f")
    logging.debug("Loading ratings from " + matrixFileName)

    for i, line in enumerate(matrixFile):
        if i % 1000000 == 0:
            logging.debug("Iteration: " + str(i))

        vals = line.split()

        userIndexer.append(vals[0])
        movieIndexer.append(vals[1])
        ratings.append(float(vals[2]))

    rowInds = userIndexer.getArray()
    colInds = movieIndexer.getArray()
    ratings = numpy.array(ratings)

    X = sppy.csarray((len(userIndexer.getIdDict()), len(movieIndexer.getIdDict())), storagetype="row", dtype=numpy.int)
    X.put(numpy.array(ratings > 3, numpy.int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
    X.prune()
    X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

    logging.debug("Read file: " + matrixFileName)
    logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    #X = Sampling.sampleUsers(X, 1000)

    return X
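#--- The dataset loaders in this file share a load -> binarise (rating > 3) ->
#--- prune pattern. Below is a minimal self-contained sketch of that pattern
#--- using scipy.sparse in place of sppy, on made-up rating triplets.
import numpy
import scipy.sparse

rowInds = numpy.array([0, 0, 1, 2, 2, 3])
colInds = numpy.array([0, 1, 1, 0, 2, 2])
ratings = numpy.array([5.0, 2.0, 4.0, 1.0, 5.0, 4.0])

#Keep only "positive" ratings, i.e. those greater than 3
X = scipy.sparse.csr_matrix(((ratings > 3).astype(numpy.int64), (rowInds, colInds)))
X.eliminate_zeros()

#Drop rows/columns with too few non-zeros (the job pruneMatrixRowAndCols does above)
minNnzRows, minNnzCols = 1, 1
rowMask = numpy.asarray((X != 0).sum(1)).ravel() >= minNnzRows
colMask = numpy.asarray((X != 0).sum(0)).ravel() >= minNnzCols
X = X[rowMask, :][:, colMask]
print(X.shape, X.nnz)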
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        # read options
        try:
            opts, args = getopt.getopt(argv[1:], "hd:n:D", ["help", "dir=", "nb_user=", "debug"])
        except getopt.error as msg:
            raise RGUsage(msg)
        # apply options
        dir = PathDefaults.getDataDir() + "cluster/"
        nb_user = None
        log_level = logging.INFO
        for o, a in opts:
            if o in ("-h", "--help"):
                print(__doc__)
                return 0
            elif o in ("-d", "--dir"):
                dir = a
            elif o in ("-n", "--nb_user"):
                nb_user = int(a)
            elif o in ("-D", "--debug"):
                log_level = logging.DEBUG
        logging.basicConfig(stream=sys.stdout, level=log_level, format='%(levelname)s (%(asctime)s):%(message)s')
        # process: generate data files
        BemolData.generate_data_file(dir, nb_user)
    except RGUsage as err:
        logging.error(err.msg)
        logging.error("for help use --help")
        return 2
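#--- The getopt pattern used by main(), run against a made-up argv for illustration:
import getopt
opts, args = getopt.getopt(["-n", "42", "--debug"], "hd:n:D", ["help", "dir=", "nb_user=", "debug"])
print(opts)   #[('-n', '42'), ('--debug', '')]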
def __init__(self, maxIter=None, iterStartTimeStamp=None):
    outputDir = PathDefaults.getOutputDir() + "recommend/erasm/"

    if not os.path.exists(outputDir):
        os.mkdir(outputDir)

    #iterStartTimeStamp is the starting date of the iterator
    if iterStartTimeStamp is not None:
        self.iterStartTimeStamp = iterStartTimeStamp
    else:
        self.iterStartTimeStamp = 1286229600

    self.timeStep = timedelta(30).total_seconds()

    self.ratingFileName = outputDir + "data.npz"
    self.userDictFileName = outputDir + "userIdDict.pkl"
    self.groupDictFileName = outputDir + "groupIdDict.pkl"
    self.isTrainRatingsFileName = outputDir + "is_train.npz"

    self.dataDir = PathDefaults.getDataDir() + "erasm/"
    self.dataFileName = self.dataDir + "groupMembers-29-11-12"

    self.maxIter = maxIter
    self.trainSplit = 4.0/5

    self.processRatings()
    self.splitDataset()
    self.loadProcessedData()
def testEdgeFile(self):
    """
    Figure out the problem with the edge file
    """
    dataDir = PathDefaults.getDataDir() + "cluster/"
    edgesFilename = dataDir + "Cit-HepTh.txt"

    edges = {}
    file = open(edgesFilename, 'r')
    #Skip the 4 header lines
    file.readline()
    file.readline()
    file.readline()
    file.readline()

    vertices = {}

    for line in file:
        (vertex1, sep, vertex2) = line.partition("\t")
        vertex1 = vertex1.strip()
        vertex2 = vertex2.strip()
        edges[(vertex1, vertex2)] = 0
        vertices[vertex1] = 0
        vertices[vertex2] = 0

    #The paper says there are 352807 edges and 27770 vertices
    self.assertEquals(len(edges), 352807)
    self.assertEquals(len(vertices), 27770)
def epinions(minNnzRows=10, minNnzCols=3, quantile=90):
    matrixFileName = PathDefaults.getDataDir() + "epinions/rating.mat"
    A = scipy.io.loadmat(matrixFileName)["rating"]

    userIndexer = IdIndexer("i")
    itemIndexer = IdIndexer("i")

    for i in range(A.shape[0]):
        userIndexer.append(A[i, 0])
        itemIndexer.append(A[i, 1])

    rowInds = userIndexer.getArray()
    colInds = itemIndexer.getArray()
    ratings = A[:, 3]

    X = sppy.csarray((len(userIndexer.getIdDict()), len(itemIndexer.getIdDict())), storagetype="row", dtype=numpy.int)
    X.put(numpy.array(ratings > 3, numpy.int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
    X.prune()
    X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

    logging.debug("Read file: " + matrixFileName)
    logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    return X
def testToyData(self):
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    data = numpy.load(dataDir + "toyData.npz")
    gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

    pxSum = 0
    pY1XSum = 0
    pYminus1XSum = 0
    px2Sum = 0

    squareArea = (gridPoints[1] - gridPoints[0])**2

    #Approximate each density's integral over the grid: the value on each cell
    #is the average of its four corners, weighted by the cell area
    for i in range(gridPoints.shape[0]-1):
        for j in range(gridPoints.shape[0]-1):
            px = (pdfX[i, j] + pdfX[i+1, j] + pdfX[i, j+1] + pdfX[i+1, j+1])/4
            pxSum += px*squareArea

            pY1X = (pdfY1X[i, j] + pdfY1X[i+1, j] + pdfY1X[i, j+1] + pdfY1X[i+1, j+1])/4
            pY1XSum += pY1X*squareArea

            pYminus1X = (pdfYminus1X[i, j] + pdfYminus1X[i+1, j] + pdfYminus1X[i, j+1] + pdfYminus1X[i+1, j+1])/4
            pYminus1XSum += pYminus1X*squareArea

            px2Sum += px*pY1X*squareArea + px*pYminus1X*squareArea

    self.assertAlmostEquals(pxSum, 1)
    print(pY1XSum)
    print(pYminus1XSum)
    self.assertAlmostEquals(px2Sum, 1)
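#--- The same four-corner cell average on a known density: a standard 2D
#--- Gaussian has nearly all of its mass inside [-5, 5]^2, so the grid sum
#--- should come out close to 1. Self-contained numpy sketch of the check above.
import numpy

gridPoints = numpy.linspace(-5, 5, 101)
xx, yy = numpy.meshgrid(gridPoints, gridPoints, indexing="ij")
pdf = numpy.exp(-(xx**2 + yy**2)/2)/(2*numpy.pi)

squareArea = (gridPoints[1] - gridPoints[0])**2
corners = (pdf[:-1, :-1] + pdf[1:, :-1] + pdf[:-1, 1:] + pdf[1:, 1:])/4
print(numpy.sum(corners)*squareArea)   #approximately 1.0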
def testComputeIdealPenalty(self):
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    data = numpy.load(dataDir + "toyData.npz")
    gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

    sampleSize = 100
    trainX, trainY = X[0:sampleSize, :], y[0:sampleSize]
    testX, testY = X[sampleSize:, :], y[sampleSize:]

    #We form a test set from the grid points
    fullX = numpy.zeros((gridPoints.shape[0]**2, 2))
    for m in range(gridPoints.shape[0]):
        fullX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 0] = gridPoints
        fullX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 1] = gridPoints[m]

    C = 1.0
    gamma = 1.0
    args = (trainX, trainY, fullX, C, gamma, gridPoints, pdfX, pdfY1X, pdfYminus1X)
    penalty = computeIdealPenalty(args)

    #Now compute penalty using data
    args = (trainX, trainY, testX, testY, C, gamma)
    penalty2 = computeIdealPenalty2(args)

    self.assertAlmostEquals(penalty2, penalty, 2)
def __init__(self):
    self.labelNames = ["Cortisol.val", "Testosterone.val", "IGF1.val"]
    self.dataDir = PathDefaults.getDataDir() + "metabolomic/"
    self.boundsDict = {}
    self.boundsDict["Cortisol"] = numpy.array([0, 89, 225, 573])
    self.boundsDict["Testosterone"] = numpy.array([0, 3, 9, 13])
    self.boundsDict["IGF1"] = numpy.array([0, 200, 441, 782])
def testPredict2(self):
    #Test on Gauss2D dataset
    dataDir = PathDefaults.getDataDir()

    fileName = dataDir + "Gauss2D_learn.csv"
    XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    X = XY[:, 0:2]
    y = XY[:, 2]

    fileName = dataDir + "Gauss2D_test.csv"
    testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    testX = testXY[:, 0:2]
    testY = testXY[:, 2]

    X = Standardiser().standardiseArray(X)
    testX = Standardiser().standardiseArray(testX)

    maxDepths = range(3, 10)
    trainAucs = numpy.array([0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508, 0.7367508, 0.7367508])
    testAucs = numpy.array([0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400, 0.6874400, 0.6874400])

    i = 0
    #The results are approximately the same, but not exactly
    for maxDepth in maxDepths:
        treeRank = TreeRank(self.leafRanklearner)
        treeRank.setMaxDepth(maxDepth)
        treeRank.learnModel(X, y)

        trainScores = treeRank.predict(X)
        testScores = treeRank.predict(testX)

        self.assertAlmostEquals(Evaluator.auc(trainScores, y), trainAucs[i], 2)
        self.assertAlmostEquals(Evaluator.auc(testScores, testY), testAucs[i], 1)
        i += 1
def getIterator():
    dataDir = PathDefaults.getDataDir() + "cluster/"
    nbUser = 10000           # set to 'None' to have all users
    nbPurchasesPerIt = 500   # set to 'None' to take all the purchases per date
    startingIteration = 300
    endingIteration = 600    # set to 'None' to have all iterations
    stepSize = 1

    return itertools.islice(BemolData.getGraphIterator(dataDir, nbUser, nbPurchasesPerIt), startingIteration, endingIteration, stepSize)
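#--- itertools.islice(it, start, stop, step) consumes and discards the skipped
#--- items one at a time rather than materialising a list, which is why
#--- getIterator() can window a long graph iterator cheaply; a tiny demo:
import itertools
squares = (i*i for i in itertools.count())
print(list(itertools.islice(squares, 300, 310, 2)))   #squares of 300, 302, ..., 308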
def profileClusterFromIterator(self):
    iterator = IncreasingSubgraphListIterator(self.graph, self.subgraphIndicesList)
    dataDir = PathDefaults.getDataDir() + "cluster/"
    #iterator = getBemolGraphIterator(dataDir)

    def run():
        clusterList, timeList, boundList = self.clusterer.clusterFromIterator(iterator, verbose=True)
        print(timeList.cumsum(0))

    ProfileUtils.profile('run()', globals(), locals())
def syntheticDataset2():
    """
    Create a simple synthetic dataset using a power law distribution on users and items
    """
    resultsDir = PathDefaults.getDataDir() + "syntheticRanking/"
    matrixFileName = resultsDir + "dataset1.mtx"

    X = sppy.io.mmread(matrixFileName, storagetype="row")

    return X
def testGetTrainIteratorFunc(self):
    dataFilename = PathDefaults.getDataDir() + "reference/author_document_count"
    dataset = Static2IdValDataset(dataFilename)

    trainIterator = dataset.getTrainIteratorFunc()()
    testIterator = dataset.getTestIteratorFunc()()

    for trainX in trainIterator:
        testX = next(testIterator)
        print(trainX.shape, trainX.nnz, testX.nnz)
        self.assertEquals(trainX.shape, testX.shape)
def processParkinsonsDataset(name, numRealisations):
    numpy.random.seed(21)
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
    fileName = dataDir + name + ".data"

    XY = numpy.loadtxt(fileName, delimiter=",", skiprows=1)
    inds = list(set(range(XY.shape[1])) - set([5, 6]))
    X = XY[:, inds]

    y1 = XY[:, 5]
    y2 = XY[:, 6]

    #We don't keep whole collections of patients
    split = 0.5
    idx = Sampling.shuffleSplit(numRealisations, X.shape[0], split)

    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "-motor/"
    preprocessSave(X, y1, outputDir, idx)

    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "-total/"
    preprocessSave(X, y2, outputDir, idx)
def profileSvd2(self):
    dataDir = PathDefaults.getDataDir() + "erasm/contacts/"
    trainFilename = dataDir + "contacts_train"

    trainX = scipy.io.mmread(trainFilename)
    trainX = scipy.sparse.csc_matrix(trainX, dtype=numpy.int8)

    k = 500
    U, s, V = RandomisedSVD.svd(trainX, k)

    print(s)
    print("All done")
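#--- A numpy-only sketch of the randomised SVD (Halko et al.) that
#--- RandomisedSVD.svd presumably implements: random projection, a few power
#--- iterations, then an exact SVD of the small projected matrix.
import numpy

def randomisedSvdSketch(A, k, q=2, p=10):
    Omega = numpy.random.randn(A.shape[1], k + p)   #random test matrix
    Y = A.dot(Omega)
    for _ in range(q):                              #power iterations sharpen spectral decay
        Y = A.dot(A.T.dot(Y))
    Q, _ = numpy.linalg.qr(Y)                       #orthonormal basis for the range of A
    B = Q.T.dot(A)                                  #small (k+p) x n matrix
    Uhat, s, Vt = numpy.linalg.svd(B, full_matrices=False)
    return Q.dot(Uhat)[:, :k], s[:k], Vt[:k, :]

A = numpy.random.rand(200, 100)
U, s, Vt = randomisedSvdSketch(A, 10)
print(s[:5])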
def mendeley2(minNnzRows=10, minNnzCols=2, quantile=90, dataset="Document"):
    authorAuthorFileName = PathDefaults.getDataDir() + "reference/author" + dataset + "Matrix.mtx"
    logging.debug("Reading file: " + authorAuthorFileName)
    X = sppy.io.mmread(authorAuthorFileName, storagetype="row")
    logging.debug("Raw non-zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

    logging.debug("Read file: " + authorAuthorFileName)
    logging.debug("Non-zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    return X
def testPredict2(self):
    #Test on Gauss2D dataset
    dataDir = PathDefaults.getDataDir()

    fileName = dataDir + "Gauss2D_learn.csv"
    XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    X = XY[:, 0:2]
    y = XY[:, 2]*2 - 1

    fileName = dataDir + "Gauss2D_test.csv"
    testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    testX = testXY[:, 0:2]
    testY = testXY[:, 2]*2 - 1

    #X = Standardiser().standardiseArray(X)
    #testX = Standardiser().standardiseArray(testX)

    maxDepths = range(3, 10)
    trainAucs = numpy.array([0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508, 0.7367508, 0.7367508])
    testAucs = numpy.array([0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400, 0.6874400, 0.6874400])

    i = 0
    #The results are approximately the same, but not exactly
    for maxDepth in maxDepths:
        treeRank = TreeRank(DecisionTree)
        treeRank.setMaxDepth(maxDepth)
        treeRank.learnModel(X, y)

        trainScores = treeRank.predict(X)
        testScores = treeRank.predict(testX)

        #print(Evaluator.auc(trainScores, y), Evaluator.auc(testScores, testY))
        #self.assertAlmostEquals(Evaluator.auc(trainScores, y), trainAucs[i], 2)
        #self.assertAlmostEquals(Evaluator.auc(testScores, testY), testAucs[i], 1)
        i += 1

    #Compare the tree to that of the R version
    tree = treeRank.getTree()
def profilePropackSvd(self):
    dataDir = PathDefaults.getDataDir() + "erasm/contacts/"
    trainFilename = dataDir + "contacts_train"

    trainX = scipy.io.mmread(trainFilename)
    trainX = scipy.sparse.csc_matrix(trainX, dtype=numpy.int8)

    k = 500
    U, s, V = SparseUtils.svdPropack(trainX, k, kmax=k*5)

    print(s)
    #Memory consumption is dependent on kmax
    print("All done")
def testBayesError(self):
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    data = numpy.load(dataDir + "toyData.npz")
    gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

    sampleSize = 100
    trainX, trainY = X[0:sampleSize, :], y[0:sampleSize]
    testX, testY = X[sampleSize:, :], y[sampleSize:]

    #We form a test set from the grid points
    gridX = numpy.zeros((gridPoints.shape[0]**2, 2))
    for m in range(gridPoints.shape[0]):
        gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 0] = gridPoints
        gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 1] = gridPoints[m]

    Cs = 2**numpy.arange(-5, 5, dtype=numpy.float)
    gammas = 2**numpy.arange(-5, 5, dtype=numpy.float)

    bestError = 1

    for C in Cs:
        for gamma in gammas:
            svm = LibSVM(kernel="gaussian", C=C, kernelParam=gamma)
            svm.learnModel(trainX, trainY)
            predY, decisionsY = svm.predict(gridX, True)
            decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")
            error = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)

            predY, decisionsY = svm.predict(testX, True)
            error2 = Evaluator.binaryError(testY, predY)
            print(error, error2)

            #Keep the parameters with the smallest Bayes error
            if error < bestError:
                bestError = error
                bestC = C
                bestGamma = gamma

    svm = LibSVM(kernel="gaussian", C=bestC, kernelParam=bestGamma)
    svm.learnModel(trainX, trainY)
    predY, decisionsY = svm.predict(gridX, True)
    decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")

    plt.figure(0)
    plt.contourf(gridPoints, gridPoints, decisionGrid, 100)
    plt.colorbar()

    plt.figure(1)
    plt.scatter(X[y==1, 0], X[y==1, 1], c='r', label="+1")
    plt.scatter(X[y==-1, 0], X[y==-1, 1], c='b', label="-1")
    plt.legend()
    plt.show()
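#--- The double loop above is a plain exhaustive grid search over (C, gamma).
#--- A compact equivalent with scikit-learn on synthetic data, for comparison
#--- only (LibSVM/ModelSelectUtils above are project classes, not sklearn).
import numpy
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

rng = numpy.random.RandomState(21)
Xs = rng.randn(200, 2)
ys = numpy.sign(Xs[:, 0] + 0.5*rng.randn(200))

paramGrid = {"C": 2.0**numpy.arange(-5, 5), "gamma": 2.0**numpy.arange(-5, 5)}
search = GridSearchCV(SVC(kernel="rbf"), paramGrid, cv=3).fit(Xs, ys)
print(search.best_params_)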
def movieLens(minNnzRows=10, minNnzCols=2, quantile=90):
    matrixFileName = PathDefaults.getDataDir() + "movielens/ml-100k/u.data"
    data = numpy.loadtxt(matrixFileName)

    X = sppy.csarray((numpy.max(data[:, 0]), numpy.max(data[:, 1])), storagetype="row", dtype=numpy.int)
    X.put(numpy.array(data[:, 2] > 3, numpy.int), numpy.array(data[:, 0]-1, numpy.int32), numpy.array(data[:, 1]-1, numpy.int32), init=True)
    #X = SparseUtilsCython.centerRowsCsarray(X)
    #X[X.nonzero()] = X.values() > 0
    X.prune()

    #maxNnz = numpy.percentile(X.sum(0), quantile)
    #X = SparseUtils.pruneMatrixCols(X, minNnz=minNnzCols, maxNnz=maxNnz)
    X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

    logging.debug("Read file: " + matrixFileName)
    logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    return X
def profileArpackSvd(self):
    dataDir = PathDefaults.getDataDir() + "erasm/contacts/"
    trainFilename = dataDir + "contacts_train"

    trainX = scipy.io.mmread(trainFilename)
    trainX = scipy.sparse.csc_matrix(trainX, dtype=numpy.float32)
    print(trainX.dtype.char, trainX.dtype)

    k = 500
    U, s, V = SparseUtils.svdArpack(trainX, k, kmax=k*5)

    print(s)
    #Memory consumption is dependent on kmax and less than PROPACK
    print("All done")
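#--- scipy's own ARPACK-backed svds computes the same kind of truncated SVD;
#--- a tiny self-contained run on a random sparse matrix for scale comparison.
import numpy
import scipy.sparse
import scipy.sparse.linalg

Xs = scipy.sparse.random(1000, 500, density=0.01, format="csc", dtype=numpy.float64)
U, s, Vt = scipy.sparse.linalg.svds(Xs, k=10)
print(numpy.sort(s)[::-1])   #svds returns singular values in ascending order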
def cluster():
    k1 = 20   # numCluster to learn
    k2 = 40   # numEigenVector kept
    dir = PathDefaults.getDataDir() + "cluster/"
    graphIterator = getBemolGraphIterator(dir)

    #===========================================
    # cluster
    print("compute clusters")
    clusterer = IterativeSpectralClustering(k1, k2)
    clustersList = clusterer.clusterFromIterator(graphIterator, True)

    for i in range(len(clustersList)):
        clusters = clustersList[i]
        print(clusters)
def testRunSimulation(self):
    egoFileName = PathDefaults.getDataDir() + "infoDiffusion/EgoData.csv"
    alterFileName = PathDefaults.getDataDir() + "infoDiffusion/AlterData.csv"
    numVertices = 1000
    infoProb = 0.1

    p = 0.1
    neighbours = 10
    generator = SmallWorldGenerator(p, neighbours)
    graph = SparseGraph(VertexList(numVertices, 0))
    graph = generator.generate(graph)

    CVal = 1.0
    kernel = "linear"
    kernelParamVal = 0.0
    errorCost = 0.5
    folds = 6
    sampleSize = 1000
    maxIterations = 5

    self.svmEgoSimulator.trainClassifier(CVal, kernel, kernelParamVal, errorCost, sampleSize)
    self.svmEgoSimulator.generateRandomGraph(egoFileName, alterFileName, infoProb, graph)
    self.svmEgoSimulator.runSimulation(maxIterations)
def readHIVGraph(self, undirected=True, indicators=True):
    """
    We will use pacdate5389.csv which contains the data of infection. The
    undirected parameter instructs whether to create an undirected graph.
    If indicators is true then categorical variables are turned into
    collections of indicator ones.
    """
    converters = {1: CsvConverters.dateConv, 3: CsvConverters.dateConv, 5: CsvConverters.detectionConv, 6: CsvConverters.provConv, 8: CsvConverters.dateConv}
    converters[9] = CsvConverters.genderConv
    converters[10] = CsvConverters.orientConv
    converters[11] = CsvConverters.numContactsConv
    converters[12] = CsvConverters.numContactsConv
    converters[13] = CsvConverters.numContactsConv

    def nanProcessor(X):
        #Impute missing values with the mean of the observed entries in each column
        means = numpy.zeros(X.shape[1])
        for i in range(X.shape[1]):
            if numpy.sum(numpy.isnan(X[:, i])) > 0:
                logging.info("No. missing values in " + str(i) + "th column: " + str(numpy.sum(numpy.isnan(X[:, i]))))
            means[i] = numpy.mean(X[:, i][numpy.isnan(X[:, i]) == False])
            X[numpy.isnan(X[:, i]), i] = means[i]
        return X

    idIndex = 0
    featureIndices = converters.keys()
    multiGraphCsvReader = MultiGraphCsvReader(idIndex, featureIndices, converters, nanProcessor)

    dataDir = PathDefaults.getDataDir()
    vertexFileName = dataDir + "HIV/alldata.csv"
    edgeFileNames = [dataDir + "HIV/grafdet2.csv", dataDir + "HIV/infect2.csv"]

    sparseMultiGraph = multiGraphCsvReader.readGraph(vertexFileName, edgeFileNames, undirected, delimiter="\t")

    #For learning purposes we will convert categorical variables into a set of
    #indicator features
    if indicators:
        logging.info("Converting categorical features")
        vList = sparseMultiGraph.getVertexList()
        V = vList.getVertices(list(range(vList.getNumVertices())))
        catInds = [2, 3]
        generator = FeatureGenerator()
        V = generator.categoricalToIndicator(V, catInds)
        vList.replaceVertices(V)

    logging.info("Created " + str(sparseMultiGraph.getNumVertices()) + " examples with " + str(sparseMultiGraph.getVertexList().getNumFeatures()) + " features")

    return sparseMultiGraph
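#--- Column-mean imputation, as done by nanProcessor above, in isolation:
import numpy

Xs = numpy.array([[1.0, numpy.nan], [3.0, 4.0], [numpy.nan, 8.0]])
for i in range(Xs.shape[1]):
    col = Xs[:, i]                         #a view, so writes modify Xs
    if numpy.isnan(col).any():
        colMean = numpy.nanmean(col)       #mean over the observed entries only
        col[numpy.isnan(col)] = colMean
print(Xs)   #NaNs replaced by the column means 2.0 and 6.0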
def mendeley(minNnzRows=10, minNnzCols=2, quantile=90, dataset="Doc", sigma=0.05, indicator=True):
    authorAuthorFileName = PathDefaults.getDataDir() + "reference/authorAuthor" + dataset + "Matrix_sigma=" + str(sigma) + ".mtx"
    logging.debug("Reading file: " + authorAuthorFileName)
    X = sppy.io.mmread(authorAuthorFileName, storagetype="row")

    if indicator:
        X[X.nonzero()] = 1
        X.prune()

    logging.debug("Raw non-zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

    logging.debug("Read file: " + authorAuthorFileName)
    logging.debug("Non-zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    return X
def saveResults(orderedItems, scores, dataset, similaritiesFileName, contactsFilename, interestsFilename, minScore, minContacts, minAcceptableSims):
    #Now let's write out the similarities file
    logging.debug("Generating recommendations for authors")
    authorIndexerFilename = PathDefaults.getDataDir() + "reference/authorIndexer" + dataset + ".pkl"
    authorIndexerFile = open(authorIndexerFilename)
    authorIndexer = pickle.load(authorIndexerFile)
    authorIndexerFile.close()
    logging.debug("Loaded author indexer")

    reverseIndexer = authorIndexer.reverseTranslateDict()

    outputFile = open(similaritiesFileName, "w")
    csvFile = csv.writer(outputFile, delimiter='\t')

    for i in range(orderedItems.shape[0]):
        if i % 10000 == 0:
            logging.debug("Iteration: " + str(i))

        row = [reverseIndexer[i]]

        #Check the author isn't recommended to him/herself
        for j in range(orderedItems.shape[1]):
            if orderedItems[i, j] != i:
                row = [reverseIndexer[i], reverseIndexer[orderedItems[i, j]], scores[i, j]]
                csvFile.writerow(row)

    outputFile.close()
    logging.debug("Wrote recommendations to " + similaritiesFileName)

    #Figure out how good the recommendations are on the contacts network
    contacts = read_contacts(contactsFilename)
    research_interests = read_interests(interestsFilename)
    sims = read_similar_authors(similaritiesFileName, minScore)

    logging.debug('Evaluating against contacts...')
    meanStatsContacts = evaluate_against_contacts(sims, contacts, minContacts)

    logging.debug('Evaluating against research interests...')
    meanStatsInterests = evaluate_against_research_interests(sims, research_interests, minAcceptableSims)

    logging.debug("Mean stats on contacts: " + str(meanStatsContacts))
    logging.debug("Mean stats on interests: " + str(meanStatsInterests))

    return meanStatsContacts, meanStatsInterests
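#--- The recommendation-writing loop in miniature: top-k items per user to a
#--- tab-separated buffer, skipping self-recommendations; indices are made up.
import csv
import io
import numpy

orderedItems = numpy.array([[0, 2, 1], [1, 0, 2]])   #row i: items ranked for user i
scores = numpy.array([[0.9, 0.5, 0.4], [0.8, 0.6, 0.1]])

buf = io.StringIO()
writer = csv.writer(buf, delimiter="\t")
for i in range(orderedItems.shape[0]):
    for j in range(orderedItems.shape[1]):
        if orderedItems[i, j] != i:                  #don't recommend a user to themselves
            writer.writerow([i, orderedItems[i, j], scores[i, j]])
print(buf.getvalue())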
def testPredict2(self):
    #Test on Gauss2D dataset
    dataDir = PathDefaults.getDataDir()

    fileName = dataDir + "Gauss2D_learn.csv"
    XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    X = XY[:, 0:2]
    y = XY[:, 2]
    y = y*2 - 1

    fileName = dataDir + "Gauss2D_test.csv"
    testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    testX = testXY[:, 0:2]
    testY = testXY[:, 2]
    testY = testY*2 - 1

    X = Standardiser().standardiseArray(X)
    testX = Standardiser().standardiseArray(testX)

    numTrees = 5
    minSplit = 50
    maxDepths = range(3, 10)

    trainAucs = numpy.array([0.7252582, 0.7323278, 0.7350289, 0.7372529, 0.7399985, 0.7382176, 0.7395104, 0.7386347])
    testAucs = numpy.array([0.6806122, 0.6851614, 0.6886183, 0.6904147, 0.6897266, 0.6874600, 0.6875980, 0.6878801])

    i = 0
    #The results are approximately the same, but not exactly
    for maxDepth in maxDepths:
        treeRankForest = TreeRankForest(self.leafRanklearner)
        treeRankForest.setMaxDepth(maxDepth)
        treeRankForest.setMinSplit(minSplit)
        treeRankForest.setNumTrees(numTrees)
        treeRankForest.learnModel(X, y)

        trainScores = treeRankForest.predict(X)
        testScores = treeRankForest.predict(testX)
        print(Evaluator.auc(trainScores, y), Evaluator.auc(testScores, testY))

        self.assertAlmostEquals(Evaluator.auc(trainScores, y), trainAucs[i], 1)
        self.assertAlmostEquals(Evaluator.auc(testScores, testY), testAucs[i], 1)
        i += 1
def testPredict2(self):
    #We play around with parameters to maximise AUC on the IGF1_0-Haar data
    dataDir = PathDefaults.getDataDir()
    fileName = dataDir + "IGF1_0-Haar.npy"
    XY = numpy.load(fileName)
    X = XY[:, 0:XY.shape[1]-1]
    y = XY[:, XY.shape[1]-1].ravel()

    weight = numpy.bincount(numpy.array(y, numpy.int))[0]/float(y.shape[0])
    #weight = 0.5
    #weight = 0.9

    folds = 3
    decisionTree = DecisionTree()
    decisionTree.setWeight(weight)
    decisionTree.setMaxDepth(50)
    #decisionTree.setMinSplit(100)
    mean, var = decisionTree.evaluateCv(X, y, folds, Evaluator.auc)

    logging.debug("AUC = " + str(mean))
    logging.debug("Var = " + str(var))
def testPredict2(self):
    #We play around with parameters to maximise AUC on the IGF1_0-Haar data
    dataDir = PathDefaults.getDataDir()
    fileName = dataDir + "IGF1_0-Haar.npy"
    XY = numpy.load(fileName)
    X = XY[:, 0:XY.shape[1]-1]
    y = XY[:, XY.shape[1]-1].ravel()

    weight = numpy.bincount(numpy.array(y, numpy.int))[0]/float(y.shape[0])
    #weight = 0.5
    #weight = 0.9

    folds = 3
    randomForest = RandomForest()
    randomForest.setWeight(weight)
    randomForest.setMaxDepth(50)
    #randomForest.setMinSplit(100)
    mean, var = randomForest.evaluateCv(X, y, folds, Evaluator.auc)

    logging.debug("AUC = " + str(mean))
    logging.debug("Var = " + str(var))
def main():
    import sys
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    matrixFileName = PathDefaults.getDataDir() + "movielens/ml-100k/u.data"
    data = numpy.loadtxt(matrixFileName)
    X = sppy.csarray((numpy.max(data[:, 0]), numpy.max(data[:, 1])), storagetype="row")
    X[data[:, 0] - 1, data[:, 1] - 1] = numpy.array(data[:, 2] > 3, numpy.int)

    logging.debug("Read file: " + matrixFileName)
    logging.debug("Shape of data: " + str(X.shape))
    logging.debug("Number of non zeros " + str(X.nnz))

    u = 0.1
    w = 1 - u
    (m, n) = X.shape

    validationSize = 5
    trainTestXs = Sampling.shuffleSplitRows(X, 1, validationSize)
    trainX, testX = trainTestXs[0]
    trainX = trainX.toScipyCsr()

    learner = CLiMF(k=20, lmbda=0.001, gamma=0.0001)
    learner.learnModel(trainX)
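#--- A row-wise holdout of the kind Sampling.shuffleSplitRows appears to
#--- perform (semantics assumed from its usage above): move a few observed
#--- entries per row into a test matrix. numpy/scipy stand-in on random data.
import numpy
import scipy.sparse

rng = numpy.random.RandomState(21)
Xs = scipy.sparse.random(50, 30, density=0.3, random_state=rng, format="lil")
Xs[Xs.nonzero()] = 1.0                     #binarise the observed entries

testSize = 2
trainX = Xs.copy()
testX = scipy.sparse.lil_matrix(Xs.shape)
for i in range(Xs.shape[0]):
    items = numpy.array(Xs.rows[i])        #column indices observed in row i
    if items.shape[0] > testSize:
        held = rng.choice(items, testSize, replace=False)
        trainX[i, held] = 0                #lil drops entries set to zero
        testX[i, held] = 1
print(trainX.nnz, testX.nnz)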
        logging.debug('process id:' + str(os.getpid()))
        self.saveResults(self.leafRankGenerators, True)

    def run2(self):
        logging.debug('module name:' + __name__)
        logging.debug('parent process:' + str(os.getppid()))
        logging.debug('process id:' + str(os.getpid()))
        self.saveResults(self.funcLeafRankGenerators, False)

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.debug("Running from machine " + str(gethostname()))
numpy.random.seed(21)

dataDir = PathDefaults.getDataDir() + "metabolomic/"
X, X2, Xs, XOpls, YList, ages, df = MetabolomicsUtils.loadData()

waveletStr = 'db4'
mode = "cpd"
level = 10
XwDb4 = MetabolomicsUtils.getWaveletFeatures(X, 'db4', level, mode)
XwDb8 = MetabolomicsUtils.getWaveletFeatures(X, 'db8', level, mode)
XwHaar = MetabolomicsUtils.getWaveletFeatures(X, 'haar', level, mode)

dataList = []
dataList.extend([(XwDb4, "db4")])

lock = multiprocessing.Lock()
numpy.random.seed(datetime.datetime.now().microsecond)