def testReadFile(self):
    """
    Check that EgoCsvReader.readFile returns the expected matrix.

    Reads questions Q14, Q12 and Q2 from test/TestData.csv (under the
    default data directory) with missing = 1, and compares the returned
    array with a hard-coded 10 x 3 matrix to 6 decimal places of the
    norm of the difference.
    """
    eCsvReader = EgoCsvReader()
    dataDir = PathDefaults.getDataDir()
    fileName = dataDir + "test/TestData.csv"
    questionIds = [("Q14", 0), ("Q12", 1), ("Q2", 0)]
    missing = 1

    (X, titles) = eCsvReader.readFile(fileName, questionIds, missing)

    # NOTE(review): 0.402390713 repeats in the second column (rows 1, 4, 6
    # and 8); presumably that is the value substituted for missing Q12
    # entries -- confirm against EgoCsvReader.readFile.
    X2 = numpy.array([
        [0.621903386, 0.608560354, 0.33290608],
        [0.318548924, 0.402390713, 0.129956291],
        [0.956658404, 0.344317772, 0.680386616],
        [0.267607668, 0.119647983, 0.116893619],
        [0.686589498, 0.402390713, 0.426789174],
        [0.373575769, 0.025846789, 0.797125005],
        [0.493793948, 0.402390713, 0.990507109],
        [0.524534585, 0.525169385, 0.772917183],
        [0.339055395, 0.402390713, 0.684788001],
        [0.997774183, 0.790801992, 0.643252009],
    ])

    # assertAlmostEquals is a deprecated alias that newer Python versions
    # no longer provide; assertAlmostEqual is the supported spelling.
    self.assertAlmostEqual(numpy.linalg.norm(X - X2), 0, places=6)
def checkDistributions(): matFileName = "../../data/EgoAlterTransmissions.mat" examplesList = ExamplesList.readFromMatFile(matFileName) numFeatures = examplesList.getDataFieldSize("X", 1) X = examplesList.getDataField("X")[:, 0:numFeatures/2] Z = examplesList.getDataField("X")[:, numFeatures/2:numFeatures] y = examplesList.getDataField("y") A = Z[y==-1, :] #Now load directly from the CSV file #Learn the distribution of the egos eCsvReader = EgoCsvReader() egoFileName = "../../data/EgoData.csv" alterFileName = "../../data/AlterData.csv" egoQuestionIds = eCsvReader.getEgoQuestionIds() alterQuestionIds = eCsvReader.getAlterQuestionIds() (X2, titles) = eCsvReader.readFile(egoFileName, egoQuestionIds) X2[:, eCsvReader.ageIndex] = eCsvReader.ageToCategories(X2[:, eCsvReader.ageIndex]) (mu, sigma) = Util.computeMeanVar(X) (mu2, sigma2) = Util.computeMeanVar(X2) (mu3, sigma3) = Util.computeMeanVar(Z) (mu4, sigma4) = Util.computeMeanVar(A) #Seems okay. Next check alters print(("Mean " + str(mu - mu4))) print(("Variance " + str(numpy.diag(sigma - sigma4)))) """ Analysis between the Egos in EgoData.csv and those in EgoAlterTransmissions.mat reveals that the distributions match closely. The main differences are in the means and variances in Q44A - D, but this isn't too suprising. """ """
#checkDistributions() """ We will read ego and alters data, and check they have the same values. """ egoFileName = "../../data/EgoData.csv" alterFileName = "../../data/AlterData.csv" eCsvReader = EgoCsvReader() egoQuestionIds = eCsvReader.getEgoQuestionIds() alterQuestionIds = eCsvReader.getAlterQuestionIds() missing = 0 (egoX, titles) = eCsvReader.readFile(egoFileName, egoQuestionIds, missing) egoX[:, eCsvReader.ageIndex] = eCsvReader.ageToCategories(egoX[:, eCsvReader.ageIndex]) (alterX, titles) = eCsvReader.readFile(alterFileName, alterQuestionIds, missing) alterX[:, eCsvReader.ageIndex] = eCsvReader.ageToCategories(alterX[:, eCsvReader.ageIndex]) numFeatures = egoX.shape[1] numEgoExamples = egoX.shape[0] numAlterExamples = alterX.shape[0] for i in range(0, numFeatures): (histE, uniqElementsE) = Util.histogram(egoX[:, i]) (histA, uniqElementsA) = Util.histogram(alterX[:, i]) print((str(i) + " " + str(egoQuestionIds[i]))) print(("Ego " + str(uniqElementsE)))
def testReadFiles(self):
    """
    Check the examples produced by EgoCsvReader.readFiles.

    EgoData3.csv and AlterData10.csv are read twice, once with p = 0.5 and
    once with p = 0. In both cases the "X" field of the returned examples
    list is expected to hold, per row, the features of one ego followed by
    the features of another ego, and every entry of the "y" field is
    expected to be -1.
    """
    dataDir = PathDefaults.getDataDir() + "infoDiffusion/"
    egoFileName = dataDir + "EgoData3.csv"
    alterFileName = dataDir + "AlterData10.csv"

    p = 0.5
    eCsvReader = EgoCsvReader()
    eCsvReader.setP(p)
    examplesList, egoIndicesR, alterIndices, egoIndicesNR, alterIndicesNR = eCsvReader.readFiles(egoFileName, alterFileName)

    # Read in the ego and alter arrays
    (egoArray, _) = eCsvReader.readFile(egoFileName, eCsvReader.getEgoQuestionIds())
    (alterArray, _) = eCsvReader.readFile(alterFileName, eCsvReader.getAlterQuestionIds())

    # Make up the correct results
    numFeatures = examplesList.getDataFieldSize("X", 1)
    # Floor division: a float slice bound raises TypeError on Python 3.
    numPersonFeatures = numFeatures // 2

    def expectedExamples(egoPairs):
        """Build the expected (X, y) for a list of (first, second) ego row index pairs."""
        numTransmissions = len(egoPairs)
        expectedX = numpy.zeros((numTransmissions, numFeatures))
        expectedY = -numpy.ones((numTransmissions, 1))
        for row, (first, second) in enumerate(egoPairs):
            expectedX[row, 0:numPersonFeatures] = egoArray[first, :]
            expectedX[row, numPersonFeatures:numFeatures] = egoArray[second, :]
        return expectedX, expectedY

    # Note: no alters in this case, so every row pairs two egos.
    X2, y2 = expectedExamples([(0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)])

    self.assertTrue((X2 == examplesList.getDataField("X")).all())
    self.assertTrue((y2 == examplesList.getDataField("y")).all())

    # Second test
    # ================
    # I modified EgoData3 so that person 2 is the same age as person 1, and
    # hence a homophile of 1. She (2) is excluded from the non-receivers, since
    # she is a homophile of person 1.
    p = 0
    eCsvReader = EgoCsvReader()
    eCsvReader.setP(p)
    examplesList, egoIndicesR, alterIndices, egoIndicesNR, alterIndicesNR = eCsvReader.readFiles(egoFileName, alterFileName)

    # Same pairs as before, minus (0, 1).
    X2, y2 = expectedExamples([(0, 2), (1, 0), (1, 2), (2, 0), (2, 1)])

    self.assertTrue((X2 == examplesList.getDataField("X")).all())
    self.assertTrue((y2 == examplesList.getDataField("y")).all())