def testCategoricalToIndicator(self):
    """
    Check FeatureGenerator.categoricalToIndicator on a small 5x5 matrix.

    Two cases are exercised: converting columns 0 and 1 to indicator
    columns, and passing an empty index list, for which the result is
    expected to equal the input.
    """
    tol = 10**-6

    # Input matrix, written column by column.
    columns = [
        numpy.array([1, 1, 2, 4, 6]),
        numpy.array([2, 1, 2, 4, 6]),
        numpy.array([1, 1, 2, 4, 2]),
        numpy.array([1, 2, 3, 4, 2]),
        numpy.array([1.1, 2.1, 4.5, 6.2, 1.1]),
    ]
    X = numpy.column_stack(columns)
    logging.debug(X)

    generator = FeatureGenerator()

    # Columns 0 and 1 each take the values {1, 2, 4, 6}, so the expected
    # output has 4 + 4 indicator columns followed by the 3 remaining
    # columns of X.
    categoricalInds = [0, 1]
    converted = generator.categoricalToIndicator(X, categoricalInds)

    expected = numpy.array([
        [1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1.1],
        [1, 0, 0, 0, 1, 0, 0, 0, 1, 2, 2.1],
        [0, 1, 0, 0, 0, 1, 0, 0, 2, 3, 4.5],
        [0, 0, 1, 0, 0, 0, 1, 0, 4, 4, 6.2],
        [0, 0, 0, 1, 0, 0, 0, 1, 2, 2, 1.1],
    ])
    self.assertTrue(numpy.linalg.norm(expected - converted) < tol)

    # With no indices given the output should match the input.
    unchanged = generator.categoricalToIndicator(X, [])
    self.assertTrue(numpy.linalg.norm(X - unchanged) < tol)
def readHIVGraph(self, undirected=True, indicators=True):
    """
    Read the HIV data set into a multigraph and return it.

    Vertices are read from "HIV/alldata.csv" and edges from
    "HIV/grafdet2.csv" and "HIV/infect2.csv", all located under the
    directory given by PathDefaults.getDataDir() and read as tab-delimited
    files. NaN entries in each feature column are replaced by the mean of
    the non-NaN entries of that column.

    NOTE(review): earlier documentation of this method referred to
    pacdate5389.csv as the infection data file; the code below reads
    HIV/alldata.csv -- confirm which description is current.

    :param undirected: passed to the reader to select whether an undirected
        graph is created.
    :param indicators: if true, the categorical vertex features (columns 2
        and 3 of the vertex matrix) are replaced by indicator features.
    :return: the graph object produced by MultiGraphCsvReader.readGraph.
    """
    # Per-column converters for the vertex file. The insertion order is kept
    # as 1, 3, 5, 6, 8, 9, ..., 13 because the key view is handed to the reader.
    converters = {
        1: CsvConverters.dateConv,
        3: CsvConverters.dateConv,
        5: CsvConverters.detectionConv,
        6: CsvConverters.provConv,
        8: CsvConverters.dateConv,
        9: CsvConverters.genderConv,
        10: CsvConverters.orientConv,
        11: CsvConverters.numContactsConv,
        12: CsvConverters.numContactsConv,
        13: CsvConverters.numContactsConv,
    }

    def nanProcessor(X):
        # Replace NaNs in every column of X (in place) by that column's mean
        # over its non-NaN entries, logging the columns that had any missing.
        numCols = X.shape[1]
        columnMeans = numpy.zeros(numCols)

        for col in range(numCols):
            missing = numpy.isnan(X[:, col])
            numMissing = numpy.sum(missing)

            if numMissing > 0:
                logging.info("No. missing values in " + str(col) + "th column: " + str(numMissing))

            # When nothing is missing the assignment below selects no rows.
            columnMeans[col] = numpy.mean(X[:, col][~missing])
            X[missing, col] = columnMeans[col]

        return X

    idIndex = 0
    featureIndices = converters.keys()
    reader = MultiGraphCsvReader(idIndex, featureIndices, converters, nanProcessor)

    dataDir = PathDefaults.getDataDir()
    vertexFileName = dataDir + "HIV/alldata.csv"
    edgeFileNames = [dataDir + "HIV/grafdet2.csv", dataDir + "HIV/infect2.csv"]

    sparseMultiGraph = reader.readGraph(vertexFileName, edgeFileNames, undirected, delimiter="\t")

    # For learning purposes the categorical variables are converted into a
    # set of indicator features.
    if indicators:
        logging.info("Converting categorial features")
        vList = sparseMultiGraph.getVertexList()
        allVertexInds = list(range(vList.getNumVertices()))
        V = vList.getVertices(allVertexInds)

        catInds = [2, 3]
        featureGenerator = FeatureGenerator()
        V = featureGenerator.categoricalToIndicator(V, catInds)
        vList.replaceVertices(V)

    numExamples = sparseMultiGraph.getNumVertices()
    numFeatures = sparseMultiGraph.getVertexList().getNumFeatures()
    logging.info("Created " + str(numExamples) + " examples with " + str(numFeatures) + " features")

    return sparseMultiGraph