Exemplo n.º 1
0
 def testGraphFromMatFile(self):
     """
     Check that the graph built by EgoUtils.graphFromMatFile is consistent
     with the examples read from the same .mat file by ExamplesList.

     Each example row of "X" is expected to produce one edge and two
     vertices: vertex 2*i carries the first half of row i and vertex 2*i+1
     the second half. The last vertex feature is an indicator that is 1 on
     even vertices and 0 on odd ones.
     """
     matFileName = PathDefaults.getDataDir() +  "infoDiffusion/EgoAlterTransmissions1000.mat"
     sGraph = EgoUtils.graphFromMatFile(matFileName)

     examplesList = ExamplesList.readFromMatFile(matFileName)
     numFeatures = examplesList.getDataFieldSize("X", 1)
     # Floor division: "/" yields a float on Python 3, which is not a valid
     # slice bound and would make the slices below raise TypeError.
     halfFeatures = numFeatures // 2
     numVertexFeatures = sGraph.getVertexList().getNumFeatures()

     self.assertEqual(examplesList.getNumExamples(), sGraph.getNumEdges())
     self.assertEqual(examplesList.getNumExamples()*2, sGraph.getNumVertices())
     self.assertEqual(halfFeatures+1, numVertexFeatures)

     #Every even vertex has information, odd does not
     for i in range(0, sGraph.getNumVertices()):
         vertex = sGraph.getVertex(i)
         expectedInfo = 1 if i%2 == 0 else 0
         self.assertEqual(vertex[numVertexFeatures-1], expectedInfo)

     #Test the first few vertices are the same
     for i in range(0, 10):
         vertex1 = sGraph.getVertex(i*2)[0:halfFeatures]
         vertex2 = sGraph.getVertex(i*2+1)[0:halfFeatures]
         # Row i of "X" flattened: first half matches vertex 2*i, second half vertex 2*i+1
         exampleRow = examplesList.getSubDataField("X", numpy.array([i])).ravel()
         vertexEx1 = exampleRow[0:halfFeatures]
         vertexEx2 = exampleRow[halfFeatures:numFeatures]

         self.assertTrue((vertex1 == vertexEx1).all())
         self.assertTrue((vertex2 == vertexEx2).all())
Exemplo n.º 2
0
    def setUp(self):
        """
        Build the fixtures shared by the tests.

        Sets the following attributes:
          self.means, self.vars -- random mean vector and symmetric matrix
              passed to the vertex generator
          self.egoGenerator     -- EgoGenerator used to create the vertices
          self.sGraph           -- graph returned by SmallWorldGenerator
          self.nb               -- NaiveBayes model learnt from the .mat data
          self.egoSimulator     -- EgoSimulator over self.sGraph and self.nb
          self.dc               -- dummy classifier based on the first feature
        """
        numVertices = 500
        numFeatures = 49

        self.means = rand.randn(numFeatures)
        self.vars = rand.randn(numFeatures, numFeatures)
        self.vars = self.vars + self.vars.T #Make vars symmetric
        p1 = 0.1

        self.egoGenerator = EgoGenerator()
        vList = self.egoGenerator.generateIndicatorVertices(numVertices, self.means, self.vars, p1)
        sGraph = SparseGraph(vList)

        p2 = 0.1
        k = 5

        #Create the graph edges according to the small world model
        graphGen = SmallWorldGenerator(p2, k)
        self.sGraph = graphGen.generate(sGraph)

        dataDir = PathDefaults.getDataDir() + "infoDiffusion/"
        matFileName = dataDir + "EgoAlterTransmissions1000.mat"
        sampleSize = 100
        egoAlterExamplesList = ExamplesList.readFromMatFile(matFileName)
        egoAlterExamplesList.setDefaultExamplesName("X")
        egoAlterExamplesList.setLabelsName("y")
        egoAlterExamplesList.randomSubData(sampleSize)

        # NOTE(review): getDataField is used right after randomSubData; other
        # code in this project reads labels via getSampledDataField - confirm
        # that training on these fields is what is intended.
        X = egoAlterExamplesList.getDataField("X")
        y = egoAlterExamplesList.getDataField("y")

        #Now learn using NaiveBayes
        self.nb = NaiveBayes()
        self.nb.learnModel(X, y)

        self.egoSimulator = EgoSimulator(self.sGraph, self.nb)

        #Define a classifier which predicts transfer if gender is female
        #(presumably column 0 holds gender with 0 == female; verify against the data)
        class DummyClassifier(object):
            # Was misspelt "__init", which left the initializer as a dead method
            def __init__(self):
                pass

            def classify(self, X):
                """Return +1 for rows whose first feature is 0, otherwise -1."""
                y = numpy.zeros((X.shape[0]))

                for i in range(X.shape[0]):
                    if X[i, 0] == 0:
                        y[i] = 1
                    else:
                        y[i] = -1
                return y

        self.dc = DummyClassifier()
Exemplo n.º 3
0
    def readFiles(self, egoFileName, alterFileName, missing=0):
        """
        Read the ego and alter files and build a labelled list of
        (ego, contact) examples.

        Each example row is the ego's feature vector followed by the feature
        vector of a contact. Rows for generated non-receivers come first and
        are labelled -1; rows for receivers follow and are labelled +1.

        :param egoFileName: path of the ego file, passed to self.readFile
        :param alterFileName: path of the alter file, passed to self.readFile
        :param missing: forwarded unchanged to self.readFile (presumably the
            value substituted for missing entries; confirm in readFile)
        :returns: tuple (examplesList, egoIndicesR, alterIndices,
            egoIndicesNR, alterIndicesNR), where examplesList holds the data
            fields "X" (features) and "y" (labels), and the index arrays are
            those returned by generateReceivers and generateNonReceivers.
        """
        egoArray, _ = self.readFile(egoFileName, self.egoQuestionIds, missing)
        alterArray, _ = self.readFile(alterFileName, self.alterQuestionIds, missing)

        #Augment receivers with new information
        egoAlterQuestionIds = self.__getAlterQuestionIds()
        alterFieldIndices = self.getAlterFieldIndices()

        egoAlterArray, _ = self.readFile(egoFileName, egoAlterQuestionIds, missing)
        (receiversArray, egoIndicesR, alterIndices) = self.generateReceivers(egoAlterArray, alterArray, alterFieldIndices)

        #Make sure we count receivers for all egos
        #(numpy.int was removed in NumPy 1.24; the builtin int is the same dtype)
        receiverCounts = numpy.zeros(egoArray.shape[0], int)
        if egoIndicesR.shape[0] != 0:
            binCount = numpy.bincount(egoIndicesR)
        else:
            binCount = numpy.array([])
        # bincount only extends to the largest ego index seen, so egos past
        # that index keep a count of zero
        receiverCounts[0:binCount.shape[0]] = binCount

        #Generate non-receivers
        numContactsIndices = [self.numFriendsIndex, self.numColleaguesIndex, self.numFamilyIndex, self.numAquantancesIndex]
        homophileIndexPairs = [
            (self.homophileAgeIndex, self.ageIndex),
            (self.homophileGenderIndex, self.genderIndex),
            (self.homophileEducationIndex, self.educationIndex),
            (self.homophileIncomeIndex, self.incomeIndex),
        ]

        (nonReceiversArray, egoIndicesNR, alterIndicesNR) = self.generateNonReceivers(egoArray, numContactsIndices, homophileIndexPairs, receiverCounts)

        #Now, we generate all pairs of senders/non-senders and receivers/non-receivers
        numNonReceivers = nonReceiversArray.shape[0]
        numReceivers = receiversArray.shape[0]
        numExamples = numNonReceivers + numReceivers
        numPersonFeatures = egoArray.shape[1]
        numFeatures = numPersonFeatures*2

        X = numpy.zeros((numExamples, numFeatures))
        y = numpy.zeros(numExamples, numpy.int32)

        #Non-receiver pairs occupy the first rows and are labelled -1
        for i in range(numNonReceivers):
            X[i, 0:numPersonFeatures] = egoArray[egoIndicesNR[i], :]
            X[i, numPersonFeatures:numFeatures] = nonReceiversArray[i, :]
            y[i] = -1

        #Receiver pairs follow and are labelled +1
        for j in range(numReceivers):
            i = numNonReceivers + j
            X[i, 0:numPersonFeatures] = egoArray[egoIndicesR[j], :]
            X[i, numPersonFeatures:numFeatures] = receiversArray[j, :]
            y[i] = 1

        examplesList = ExamplesList(numExamples)
        examplesList.addDataField("X", X)
        examplesList.addDataField("y", y)
        examplesList.setDefaultExamplesName("X")
        examplesList.setLabelsName("y")

        return examplesList, egoIndicesR, alterIndices, egoIndicesNR, alterIndicesNR
Exemplo n.º 4
0
def checkDistributions():
    """
    Compare feature distributions in EgoAlterTransmissions.mat with those
    read directly from EgoData.csv.

    The columns of "X" are split into a first half (X) and a second half
    (Z); A holds the rows of Z whose label is -1. Means and variances are
    computed for each, and the differences between X and A are printed.
    """
    matFileName = "../../data/EgoAlterTransmissions.mat"
    examplesList = ExamplesList.readFromMatFile(matFileName)

    numFeatures = examplesList.getDataFieldSize("X", 1)
    # Floor division: "/" yields a float on Python 3, which cannot be used
    # as a slice bound.
    halfFeatures = numFeatures // 2
    X = examplesList.getDataField("X")[:, 0:halfFeatures]
    Z = examplesList.getDataField("X")[:, halfFeatures:numFeatures]
    y = examplesList.getDataField("y")
    A = Z[y==-1, :]

    #Now load directly from the CSV file
    #Learn the distribution of the egos
    eCsvReader = EgoCsvReader()
    egoFileName = "../../data/EgoData.csv"
    alterFileName = "../../data/AlterData.csv"
    egoQuestionIds = eCsvReader.getEgoQuestionIds()
    alterQuestionIds = eCsvReader.getAlterQuestionIds()
    (X2, titles) = eCsvReader.readFile(egoFileName, egoQuestionIds)
    X2[:, eCsvReader.ageIndex] = eCsvReader.ageToCategories(X2[:, eCsvReader.ageIndex])

    (mu, sigma) = Util.computeMeanVar(X)
    (mu2, sigma2) = Util.computeMeanVar(X2)
    (mu3, sigma3) = Util.computeMeanVar(Z)
    (mu4, sigma4) = Util.computeMeanVar(A)

    #Seems okay. Next check alters
    print(("Mean " + str(mu - mu4)))
    print(("Variance " + str(numpy.diag(sigma - sigma4))))

    """
    Analysis between the Egos in EgoData.csv and those in EgoAlterTransmissions.mat
    reveals that the distributions match closely. The main differences are
    in the means and variances in Q44A - D, but this isn't too surprising.
    """

    """
Exemplo n.º 5
0
    def testAdvanceGraph3(self):
        """
        This test will learn from a set of ego and alter pairs, then we will make predictions on
        the pairs and see the results. Then we test if the same results are present in a simulation.
        """
        dataDir = PathDefaults.getDataDir() + "infoDiffusion/"
        matFileName = dataDir +  "EgoAlterTransmissions1000.mat"
        examplesList = ExamplesList.readFromMatFile(matFileName)
        examplesList.setDefaultExamplesName("X")
        examplesList.setLabelsName("y")

        logging.debug("Number of y = +1: %s", sum(examplesList.getSampledDataField("y") == 1))
        logging.debug("Number of y = -1: %s", sum(examplesList.getSampledDataField("y") == -1))

        #Standardise the examples
        preprocessor = Standardiser()
        examplesName = examplesList.getDefaultExamplesName()
        X = examplesList.getDataField(examplesName)
        X = preprocessor.standardiseArray(X)
        examplesList.overwriteDataField(examplesName, X)

        classifier = MlpySVM(kernel='linear', kp=1, C=32.0)

        y = examplesList.getDataField("y")
        classifier.learnModel(X, y)
        predY = classifier.classify(X)
        # NOTE(review): these two lines log the true label counts a second
        # time; counts of predY may have been intended - confirm.
        logging.debug("Number of y = +1: %s", sum(examplesList.getSampledDataField("y") == 1))
        logging.debug("Number of y = -1: %s", sum(examplesList.getSampledDataField("y") == -1))

        sampledY = examplesList.getSampledDataField(examplesList.getLabelsName()).ravel()

        error = mlpy.err(sampledY, predY)
        sensitivity = mlpy.sens(sampledY, predY)
        specificity = mlpy.spec(sampledY, predY)
        errorP = mlpy.errp(sampledY, predY)
        errorN = mlpy.errn(sampledY, predY)

        logging.debug("--- Classification evaluation ---")
        logging.debug("Error on %s examples is %s", examplesList.getNumExamples(), error)
        logging.debug("Sensitivity (recall = TP/(TP+FN)): %s", sensitivity)
        logging.debug("Specificity (TN/TN+FP): %s", specificity)
        logging.debug("Error on positives: %s", errorP)
        logging.debug("Error on negatives: %s", errorN)

        sGraph = EgoUtils.graphFromMatFile(matFileName)

        #Notice that the data is preprocessed in the same way as the survey data
        egoSimulator = EgoSimulator(sGraph, classifier, preprocessor)

        # Before the simulation step, 1000 people are expected to hold the information
        totalInfo = EgoUtils.getTotalInformation(sGraph)
        logging.debug("Total number of people with information: %s", totalInfo)
        self.assertEqual(totalInfo, 1000)

        sGraph = egoSimulator.advanceGraph()

        # After one step the total should grow by the number of positive predictions
        totalInfo = EgoUtils.getTotalInformation(sGraph)
        logging.debug("Total number of people with information: %s", totalInfo)
        self.assertEqual(totalInfo, 1000 + sum(predY == 1))

        altersList = egoSimulator.getAlters(0)
        predictedAlters = numpy.nonzero(predY == 1)[0]

        # The alter of example i is expected at vertex 2*i+1
        self.assertTrue((altersList == predictedAlters*2+1).all())