def checkDistributions():
    """
    Print the difference in feature statistics between two parts of the
    EgoAlterTransmissions.mat data set.

    The "X" field of the .mat file is split column-wise into a first half (X)
    and a second half (Z) -- presumably the ego and alter features
    respectively; confirm against the code that wrote the file. A holds the
    rows of Z whose label in the "y" field equals -1. The ego CSV file is also
    read directly and its age column is mapped through
    EgoCsvReader.ageToCategories.

    Mean/variance statistics are computed for X, the CSV egos, Z and A, but
    only the X-versus-A differences are printed. Nothing is returned.
    """
    matFileName = "../../data/EgoAlterTransmissions.mat"
    examplesList = ExamplesList.readFromMatFile(matFileName)
    numFeatures = examplesList.getDataFieldSize("X", 1)

    # Floor division keeps the split point an integer: with true division
    # (Python 3) the slice bounds would be floats, which cannot index an array.
    halfFeatures = numFeatures // 2
    X = examplesList.getDataField("X")[:, 0:halfFeatures]
    Z = examplesList.getDataField("X")[:, halfFeatures:numFeatures]
    y = examplesList.getDataField("y")
    A = Z[y == -1, :]

    #Now load directly from the CSV file
    #Learn the distribution of the egos
    eCsvReader = EgoCsvReader()
    egoFileName = "../../data/EgoData.csv"
    egoQuestionIds = eCsvReader.getEgoQuestionIds()
    (X2, titles) = eCsvReader.readFile(egoFileName, egoQuestionIds)
    X2[:, eCsvReader.ageIndex] = eCsvReader.ageToCategories(X2[:, eCsvReader.ageIndex])

    # mu2/sigma2 and mu3/sigma3 are not used by the prints below; swap them
    # into the print statements to compare a different pair of data sets.
    (mu, sigma) = Util.computeMeanVar(X)
    (mu2, sigma2) = Util.computeMeanVar(X2)
    (mu3, sigma3) = Util.computeMeanVar(Z)
    (mu4, sigma4) = Util.computeMeanVar(A)

    #Seems okay. Next check alters
    print("Mean " + str(mu - mu4))
    # NOTE(review): numpy.diag suggests sigma is a square matrix whose diagonal
    # holds the per-feature variances -- confirm against Util.computeMeanVar.
    print("Variance " + str(numpy.diag(sigma - sigma4)))


"""
Analysis between the Egos in EgoData.csv and those in EgoAlterTransmissions.mat
reveals that the distributions match closely. The main differences are in the means
and variances in Q44A - D, but this isn't too surprising.
"""

# The call below is deliberately left disabled.
#checkDistributions()

"""
We will read ego and alters data, and check they have the same values.
"""
egoFileName = "../../data/EgoData.csv"
alterFileName = "../../data/AlterData.csv"

eCsvReader = EgoCsvReader()
egoQuestionIds = eCsvReader.getEgoQuestionIds()
alterQuestionIds = eCsvReader.getAlterQuestionIds()
# Passed as the third argument of readFile; presumably the value substituted
# for missing answers -- confirm against EgoCsvReader.readFile.
missing = 0

(egoX, titles) = eCsvReader.readFile(egoFileName, egoQuestionIds, missing)
egoX[:, eCsvReader.ageIndex] = eCsvReader.ageToCategories(egoX[:, eCsvReader.ageIndex])

(alterX, titles) = eCsvReader.readFile(alterFileName, alterQuestionIds, missing)
alterX[:, eCsvReader.ageIndex] = eCsvReader.ageToCategories(alterX[:, eCsvReader.ageIndex])

numFeatures = egoX.shape[1]
numEgoExamples = egoX.shape[0]
numAlterExamples = alterX.shape[0]

# Columns are paired by position and labelled with the ego question IDs; this
# assumes alterX has at least numFeatures columns in the same order -- TODO confirm.
for i in range(numFeatures):
    (histE, uniqElementsE) = Util.histogram(egoX[:, i])
    (histA, uniqElementsA) = Util.histogram(alterX[:, i])

    # Print the values reported for each data set so they can be compared by eye.
    print(str(i) + " " + str(egoQuestionIds[i]))
    print("Ego " + str(uniqElementsE))
    print("Alter " + str(uniqElementsA))