Exemplo n.º 1
0
    def testCategoricalToIndicator(self):
        """
        Check categoricalToIndicator on a small 5x5 matrix: first with two
        categorical columns selected, then with an empty selection, in which
        case the output is expected to equal the input.
        """
        tol = 10**-6

        #Each entry is one column of the input matrix
        columns = [
            [1, 1, 2, 4, 6],
            [2, 1, 2, 4, 6],
            [1, 1, 2, 4, 2],
            [1, 2, 3, 4, 2],
            [1.1, 2.1, 4.5, 6.2, 1.1],
        ]
        X = numpy.zeros((5, 5))
        for j, column in enumerate(columns):
            X[:, j] = numpy.array(column)

        logging.debug(X)

        #Expected output, one entry per row: four indicator columns for input
        #column 0, four for input column 1, then input columns 2 to 4 as they are
        expectedRows = [
            [1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1.1],
            [1, 0, 0, 0, 1, 0, 0, 0, 1, 2, 2.1],
            [0, 1, 0, 0, 0, 1, 0, 0, 2, 3, 4.5],
            [0, 0, 1, 0, 0, 0, 1, 0, 4, 4, 6.2],
            [0, 0, 0, 1, 0, 0, 0, 1, 2, 2, 1.1],
        ]
        expected = numpy.zeros((5, 11))
        for i, row in enumerate(expectedRows):
            expected[i, :] = numpy.array([row])

        generator = FeatureGenerator()

        converted = generator.categoricalToIndicator(X, [0, 1])
        difference = numpy.linalg.norm(expected - converted)
        self.assertTrue(difference < tol)

        #Test case where no indices given
        unchanged = generator.categoricalToIndicator(X, [])
        difference = numpy.linalg.norm(X - unchanged)
        self.assertTrue(difference < tol)
Exemplo n.º 2
0
    def readHIVGraph(self, undirected=True, indicators=True):
        """
        Read the HIV data into a multigraph. Vertices are read from
        HIV/alldata.csv and edges from HIV/grafdet2.csv and HIV/infect2.csv,
        all under the directory given by PathDefaults.getDataDir() and parsed
        with a tab delimiter.

        :param undirected: passed to the reader to say whether an undirected graph should be created.

        :param indicators: if True then categorical variables are turned into collections of indicator ones.

        :returns: the graph object produced by MultiGraphCsvReader.readGraph.
        """
        #Maps a column index of the vertex file to the converter used to parse it
        converters = {
            1: CsvConverters.dateConv,
            3: CsvConverters.dateConv,
            5: CsvConverters.detectionConv,
            6: CsvConverters.provConv,
            8: CsvConverters.dateConv,
            9: CsvConverters.genderConv,
            10: CsvConverters.orientConv,
            11: CsvConverters.numContactsConv,
            12: CsvConverters.numContactsConv,
            13: CsvConverters.numContactsConv,
        }

        def nanProcessor(X):
            #Replace the missing (NaN) entries of each column of X, in place,
            #with the mean of the non-missing entries of that column
            for i in range(X.shape[1]):
                missing = numpy.isnan(X[:, i])
                numMissing = numpy.sum(missing)

                if numMissing == 0:
                    continue

                logging.info("No. missing values in " + str(i) + "th column: " + str(numMissing))

                if numMissing == X.shape[0]:
                    #There is nothing to average, so the column stays as NaN
                    #rather than taking the mean of an empty slice
                    logging.warning("All values missing in " + str(i) + "th column, leaving it unchanged")
                    continue

                X[missing, i] = numpy.mean(X[~missing, i])
            return X

        idIndex = 0
        #Use a real list in a fixed ascending order (which is also the insertion
        #order above) instead of a dict view whose order is incidental.
        #NOTE(review): catInds below presumably indexes positions in this list,
        #i.e. columns 5 and 6 of the vertex file - confirm against MultiGraphCsvReader
        featureIndices = sorted(converters)
        multiGraphCsvReader = MultiGraphCsvReader(idIndex, featureIndices, converters, nanProcessor)

        dataDir = PathDefaults.getDataDir()
        vertexFileName = dataDir + "HIV/alldata.csv"
        edgeFileNames = [dataDir + "HIV/grafdet2.csv", dataDir + "HIV/infect2.csv"]

        sparseMultiGraph = multiGraphCsvReader.readGraph(vertexFileName, edgeFileNames, undirected, delimiter="\t")

        #For learning purposes we will convert categorical variables into a set of
        #indicator features
        if indicators:
            logging.info("Converting categorial features")
            vList = sparseMultiGraph.getVertexList()
            V = vList.getVertices(list(range(vList.getNumVertices())))
            catInds = [2, 3]
            generator = FeatureGenerator()
            V = generator.categoricalToIndicator(V, catInds)
            vList.replaceVertices(V)

        numExamples = sparseMultiGraph.getNumVertices()
        numFeatures = sparseMultiGraph.getVertexList().getNumFeatures()
        logging.info("Created " + str(numExamples) + " examples with " + str(numFeatures) + " features")

        return sparseMultiGraph