def plot_dens_diversity(counter_list, color, label, ls):
    diversity = [len(counter.keys()) for counter, _ in counter_list]
    pdf, grid, _ = kde(diversity, cut=0)
    min_lim = 0
    max_lim = max(diversity)
    plt.xlim(0, max_lim)
    plt.plot(grid, pdf, color=color, ls=ls)
    plt.xlabel(
        "Normalised diversity of semantic categories between drug and disease")
    plt.ylabel("Probability density")
    patch = mlines.Line2D([], [], ls=ls, color=color, label=label)
    return patch
def plot_dens(cat, counter_list, color, label, ls):
    get_track_key = partial_second(Counter.__getitem__, cat)
    track = list(map(get_track_key, counter_list))
    min_lim = 0
    max_lim = np.percentile(track, 90)
    plt.xlim(0, max_lim)
    pdf, grid, _ = kde(track, cut=0)
    plt.plot(grid, pdf, color=color, ls=ls)
    plt.xlabel(
        "Normalized number of semantic category occurrences between drug and disease"
    )
    plt.ylabel("Probability density")
    patch = mlines.Line2D([], [], ls=ls, color=color, label=label)
    return patch
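# `kde` and `partial_second` are helpers defined elsewhere in the project; the
# sketch below shows one plausible, minimal implementation under the following
# assumptions: `kde(data, cut=0)` returns (pdf, grid, bandwidth_factor) from a
# Gaussian kernel density estimate whose grid is clipped to the data range when
# cut=0, and `partial_second` binds the *second* positional argument of a
# two-argument callable (here Counter.__getitem__). These are illustrative
# assumptions, not the project's actual definitions. `np` is assumed to be
# imported at module top, as in the functions above.
from scipy.stats import gaussian_kde


def kde(data, cut=0, gridsize=200):
    """Gaussian KDE evaluated on a regular grid spanning the data (assumed API)."""
    data = np.asarray(data, dtype=float)
    estimator = gaussian_kde(data)
    # Extend the grid by roughly `cut` bandwidths beyond the data range
    # (cut=0 clips the grid to the data range).
    pad = cut * estimator.factor * data.std(ddof=1)
    grid = np.linspace(data.min() - pad, data.max() + pad, gridsize)
    return estimator(grid), grid, estimator.factor


def partial_second(func, second_arg):
    """Bind the second positional argument of a two-argument callable (assumed helper)."""
    def _wrapper(first_arg):
        return func(first_arg, second_arg)
    return _wrapper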
def checkPopulation(cfgname, additionalArgs=None):
    cfg_module = importlib.import_module(cfgname)
    cfg = cfg_module.cfg
    cfg.update(additionalArgs or {})
    print("Loading configuration...")

    # Scale of the population to save (fraction between 0 and 1).
    popScale = cfg["popScale"]
    populationFileName = cfg["populationFileName"]
    referenceName = cfg["referenceName"]
    selectedNUTS = cfg["selectedNUTS"]
    countryCode = list(selectedNUTS)[0][:2]
    NUTS3code = list(selectedNUTS)[0][:3]

    minimumPopulationPerWPkind = cfg["minimumPopulationPerWPkind"]
    workplacesDict = {
        k: synpopStructures.workplace(k)
        for k in minimumPopulationPerWPkind
    }
    # `universityBins` and `universityPDF` are module-level constants defined
    # elsewhere in this script; kind 3 is the university workplace kind.
    universityDistrib = {
        "BINS": universityBins,
        "CDF": (np.diff(universityBins) * universityPDF).cumsum(),
        "PDF": universityPDF,
    }
    workplacesDict[3].set_sizePDF(universityDistrib)

    #################################################
    #################################################

    ageBySex_PDF = pd.read_pickle(cfg["ageBySex_PDF_file"])
    ageBySex_CDF = pd.read_pickle(cfg["ageBySex_CDF_file"])

    # We also load the data at NUTS3 level (we use it to evaluate the number of
    # agents to create).
    print("Loading Eurostat data...")
    popBroadAgeBySex_NUTS3 = pd.read_pickle(cfg["popBroadAgeBySex_NUTS3_file"])

    # The df containing the distribution of household kind per NUTS2
    householdKind_PDF = pd.read_pickle(cfg["householdKind_PDF_file"])
    householdKind_CDF = pd.read_pickle(cfg["householdKind_CDF_file"])

    # The df containing the size distribution for each kind of household
    householdSizeByKind_PDF = pd.read_pickle(
        cfg["householdSizeByKind_PDF_file"])
    householdSizeByKind_CDF = pd.read_pickle(
        cfg["householdSizeByKind_CDF_file"])

    # The df containing the age distribution of the population per household type
    # (i.e. which ages the components of households have)
    ageByHHrole_PDF = pd.read_pickle(cfg["ageByHHrole_PDF_file"])
    ageByHHrole_CDF = pd.read_pickle(cfg["ageByHHrole_CDF_file"])
    ageByHHrole_RAW = pd.read_pickle(cfg["ageByHHrole_RAW_file"])

    # ### Work and education
    # The commuting probabilities for each NUTS3
    studyCommuting_df = pd.read_pickle(cfg["studyCommuting_df_file"])
    workCommuting_df = pd.read_pickle(cfg["workCommuting_df_file"])

    # Education indicator...
    educationLevelByAge_PDF = pd.read_pickle(
        cfg["educationLevelByAge_PDF_file"])
    educationLevelByAge_CDF = pd.read_pickle(
        cfg["educationLevelByAge_CDF_file"])

    # School and employment rate given education...
    schoolAttendanceRate_df = pd.read_pickle(
        cfg["schoolAttendanceRate_df_file"])
    employmentBySexAgeEdu_df = pd.read_pickle(
        cfg["employmentBySexAgeEdu_df_file"])

    # Schools and wp size...
    schoolSize_df = pd.read_pickle(cfg["schoolSize_df_file"])
    workplSize_df = pd.read_pickle(cfg["workplSize_df_file"])

    # Geodataframe
    print("Loading boundaries data...")
    geoDataFrame = pickle.load(gzip.open(cfg["geoDataFrame_file"], "rb"))
    geoDFid2nuts = pickle.load(open(cfg["geoDFid2nuts_file"], "rb"))
    geoDFnuts2id = pickle.load(open(cfg["geoDFnuts2id_file"], "rb"))
    reference_gdf_tot = geoDataFrame[-1]
    selectedNUTS = cfg["selectedNUTS"]

    def filterL0codes(l0, toKeep, id2code):
        tmp_code = id2code[l0]
        for keep in toKeep:
            if tmp_code.startswith(keep):
                return True
        return False

    reference_gdf = reference_gdf_tot[reference_gdf_tot["l0"].apply(
        lambda v: filterL0codes(v, selectedNUTS, geoDFid2nuts))].copy()

    # Household labels: the ones for the age structure and the ones for the
    # actual household kinds.
    ageHouseholdLabels = set(
        ageByHHrole_RAW.columns.get_level_values(1).unique())
    ageHouseholdLabels.discard("TOTAL")
    householdLabels = set(householdKind_PDF.columns)
    householdLabels.discard("TOTAL")

    # Couple without children
    CPL_NCH = synpopStructures.householdType(minMaxParents=(2, 2),
                                             minMaxSons=(0, 0),
                                             ageMinMaxParents=(18, 100),
                                             sexParents="etero",
                                             agePDFparents=None,
                                             agePDFsons=None)

    # Couple with young and/or old children
    CPL_WCH = synpopStructures.householdType(
        minMaxParents=(2, 2),
        minMaxSons=(1, 9),
        ageMinMaxParents=(18, 100),
        ageMinMaxChildren=(0, 80),
        dMinMaxParSon=(18, 50),
        sexParents="etero",
        sexSons="free",
        agePDFparents=None,
        agePDFsons=None,
    )

    # Lone father/mother with young/old children
    M1_CH = synpopStructures.householdType(
        minMaxParents=(1, 1),
        minMaxSons=(1, 10),
        ageMinMaxParents=(18, 100),
        ageMinMaxChildren=(0, 80),
        dMinMaxParSon=(18, 50),
        sexParents="male",
        sexSons="free",
        agePDFparents=None,
        agePDFsons=None,
    )

    F1_CH = synpopStructures.householdType(
        minMaxParents=(1, 1),
        minMaxSons=(1, 10),
        ageMinMaxParents=(18, 100),
        ageMinMaxChildren=(0, 80),
        dMinMaxParSon=(18, 50),
        sexParents="female",
        sexSons="free",
        agePDFparents=None,
        agePDFsons=None,
    )

    # Singles and multi-person households share the same age distribution
    # (but a different size).
    A1_HH = synpopStructures.householdType(
        minMaxParents=(1, 1),
        minMaxSons=(0, 0),
        ageMinMaxParents=(15, 100),
        sexParents="free",
        agePDFparents=None,
        agePDFsons=None,
    )

    MULTI_HH = synpopStructures.householdType(
        minMaxParents=(2, 11),
        minMaxSons=(0, 0),
        ageMinMaxParents=(15, 100),
        sexParents="free",
        dMinMaxp1p2=(0, 40),
        dMinMaxParSon=(30, 100),
        fixedParentsSons=(False, True),
        agePDFparents=None,
        agePDFsons=None,
    )

    # Save the households in an array and in a dictionary to remember their order.
    # We also save the column from which they will inherit the parent and sons
    # age PDF from the aggregation.
    houseHoldTypeDict = {
        "CPL_NCH": {
            "obj": CPL_NCH,
            "id": None,
            "parentAgePDFName": "CPL_XCH",
            "childsAgePDFName": None,
        },
        "CPL_WCH": {
            "obj": CPL_WCH,
            "id": None,
            "parentAgePDFName": "CPL_XCH",
            "childsAgePDFName": "CH_PAR",
        },
        "M1_CH": {
            "obj": M1_CH,
            "id": None,
            "parentAgePDFName": "A1_XCH",
            "childsAgePDFName": "CH_PAR",
        },
        "F1_CH": {
            "obj": F1_CH,
            "id": None,
            "parentAgePDFName": "A1_XCH",
            "childsAgePDFName": "CH_PAR",
        },
        "A1_HH": {
            "obj": A1_HH,
            "id": None,
            "parentAgePDFName": "A1_HH",
            "childsAgePDFName": None,
        },
        "MULTI_HH": {
            "obj": MULTI_HH,
            "id": None,
            "parentAgePDFName": "A1_HH",
            "childsAgePDFName": None,
        },
    }

    nHouseholdKinds = len(householdLabels)
    houseHoldTypeArray = [None] * nHouseholdKinds
    for idx, hhLabel in enumerate(householdLabels):
        tmp_householdEntry = houseHoldTypeDict[hhLabel]
        tmp_householdEntry["id"] = idx
        houseHoldTypeArray[idx] = tmp_householdEntry["obj"]
    houseHoldTypeArray = np.array(houseHoldTypeArray)

    # Open the reference file and load the three tables (agents, households,
    # workplaces).
    f = h5py.File(populationFileName, "r")
    loaded_array = dict()
    agDSname = cfg["agentDatasetName"]
    hhDSname = cfg["hhDatasetName"]
    wpDSname = cfg["wpDatasetName"]
    for dataset_name in (agDSname, hhDSname, wpDSname):
        dataset = f[dataset_name]
        tmp_array = np.empty(shape=dataset.shape, dtype=dataset.dtype)
        dataset.read_direct(tmp_array)
        loaded_array[dataset_name] = tmp_array
    f.close()

    ags = loaded_array[agDSname]
    wps = loaded_array[wpDSname]
    hhs = loaded_array[hhDSname]

    # Check the generated population
    print("Checking population...")

    # ## Commuting
    #
    # We check the distance distribution between home and workplace/school and
    # compare it with the theoretical curve
    #
    #   $P(d) \propto \frac{1}{(1 + d/a)^b}$
    #
    # with $a = 3.8$ km and $b = 2.32$.
    #
    # Since we generate only one region we do not reproduce the tail of the
    # distribution, as long-distance travels are missing. However, the generated
    # distribution closely follows the reference one for $d \lesssim 50$ km, so
    # the travel distances are correctly reproduced.

    # Compute the distance matrix between the LAU2 baricenters...
    baricenters_LatLon = np.array(
        reference_gdf[["BARICENTER_Y", "BARICENTER_X"]])
    baricenters_distanceM = squareform(
        pdist(baricenters_LatLon, metric=haversine))

    plt.imshow(baricenters_distanceM)
    cbar = plt.colorbar(shrink=.85)
    cbar.set_label(r"Distance - $d_{ij}\; (km)$", size=22)
    cbar.ax.tick_params(labelsize=16)
    plt.xlabel(r"Destination - $j$", size=18)
    plt.ylabel(r"Origin - $i$", size=18)
    plt.xticks(size=16)
    plt.yticks(size=16)
    plt.tight_layout()
    plt.savefig("figures/synPop_%s_01_LAU2distanceMatrix.pdf" %
                (referenceName, ),
                bbox_inches="tight")
    plt.close()

    # Commuting distance per workplace kind
    commutingDistancesPerWPkind = {k: [] for k in workplacesDict}
    for wp_kind in np.unique(ags["employed"]):
        print(wp_kind)
        if wp_kind < 0:
            # Negative kind means not employed/enrolled: no commuting distance.
            continue
        commuters = ags[ags["employed"] == wp_kind]
        tmp_dists = np.zeros(commuters.shape[0])
        iii = 0
        for commuter in commuters:
            hh_id = commuter["hh"]
            wp_id = commuter["wp"]
            tmp_hh = hhs[hh_id]
            tmp_wp = wps[wp_id]
            tmp_dists[iii] = haversine((tmp_hh["lat"], tmp_hh["lon"]),
                                       (tmp_wp["lat"], tmp_wp["lon"]))
            iii += 1
        commutingDistancesPerWPkind[wp_kind] = tmp_dists

    fig = plt.figure(figsize=(5, 4))
    iii = 1
    for wp_kind, data in commutingDistancesPerWPkind.items():
        f, b = np.histogram(data, bins=np.logspace(0, 3, 30), density=True)
        b = (b[1:] + b[:-1]) / 2.
        b = b[f > 0]
        f = f[f > 0]
        plt.loglog(b, f, label=wp_kind)

    # Plot the reference one...
    b = np.array(list(b) + [max(100, b[-1] * 1.5)])
    plt.loglog(b, .5 * (1. + b / 3.8)**-2.32, "--k", lw=2, label=r"$Thr$")
    plt.xscale("log")
    plt.yscale("log")
    plt.xlim(5, b[-1])
    plt.xlabel(r"Distance $d$ (km)", size=18)
    plt.ylabel(r"$P(d)$", size=18)
    plt.xticks(size=16)
    plt.yticks(size=16)
    plt.legend(fontsize=14,
               ncol=3,
               loc=1,
               handlelength=.5,
               labelspacing=.7,
               columnspacing=.5)
    plt.tight_layout()
    plt.savefig("figures/synPop_%s_02_CommutingDistancesPDF.pdf" %
                (referenceName, ),
                bbox_inches="tight")
    plt.close()

    # ## Workplaces size distribution
    # Since we generated only 10% of the population we get a lot of small
    # workplaces, but the overall shape of the distribution is reproduced.

    # Workplace kind: the generated data vs. the reference size distributions.
    nWorkplaceKind = len(minimumPopulationPerWPkind.keys())
    nCols = 3
    nRows = nWorkplaceKind // nCols + 1
    plt.subplots(nRows, nCols, figsize=(4.5 * nCols, 5 * nRows))
    for subplot, wp_kind in enumerate(sorted(minimumPopulationPerWPkind)):
        # Kinds 0-2 are schools, kind 3 is university, kind >= 10 is a generic
        # workplace.
        if wp_kind < 10:
            if wp_kind < 3:
                reference_data = schoolSize_df.loc[countryCode]
            else:
                reference_data = universityDistrib
        else:
            reference_data = workplSize_df.loc[NUTS3code]

        plt.subplot(nRows, nCols, subplot + 1)
        plt.title("WP kind = %d" % wp_kind)

        f, b = np.histogram(wps[wps["kind"] == wp_kind]["size"],
                            bins=reference_data["BINS"],
                            density=True)
        if len(b) == len(f) + 1:
            b = (b[1:] + b[:-1]) / 2.
        b = b[f > 1e-6]
        f = f[f > 1e-6]
        plt.plot(f, "^-C1", label="Generated")
        # plt.plot((reference_data["BINS"][1:] + reference_data["BINS"][:-1]) / 2.,
        plt.plot(reference_data["PDF"], "o-C0", label="Actual data")
        plt.xscale("linear")
        plt.yscale("log")

        tmp_empiricalBNS = reference_data["BINS"]
        nBins = len(tmp_empiricalBNS)
        step = 1
        if nBins > 7:
            step = nBins // 6
        locs = np.arange(0, nBins - 1, step)
        labs = [
            "%d-%d" % (tmp_empiricalBNS[i], tmp_empiricalBNS[i + 1])
            for i in locs
        ]
        plt.xticks(locs, labs, size=14, rotation=35, ha="right")
        plt.yticks(size=14)
        plt.xlabel(r"Size - $s$", size=18)
        plt.ylabel(r"$P(s)$", size=18)
        plt.legend(fontsize=16)
    plt.tight_layout()
    plt.savefig("figures/synPop_%s_wpKindSize.pdf" % (referenceName, ),
                bbox_inches="tight")
    plt.close()

    # Occupation of agents: compute the expected values.
    # Procedure:
    #   - expected students   = sum_age attendance_rate_age * pop_age
    #   - expected employed   = sum_age edu_age * employ|edu_age * pop_age
    #   - expected unemployed = N - expected students - expected employed
    NperWpKind = {k: .0 for k in minimumPopulationPerWPkind.keys()}
    home_nuts = [geoDFid2nuts[hhs[ag["hh"]]["l0"]] for ag in ags]
    for NUTS_code in selectedNUTS:
        # Fall back to the parent NUTS code until one with education data is found.
        tmp_nuts = NUTS_code
        while tmp_nuts not in educationLevelByAge_CDF.index:
            tmp_nuts = tmp_nuts[:-1]
        edu_levl_cdf = getEducationLevelCDF(
            educationLevelByAge_CDF["2011"].loc[tmp_nuts])
        edu_rate_age = getEducationRate(
            schoolAttendanceRate_df["2013"].loc[tmp_nuts])
        emp_rate_edu = getEmploymentProba(
            employmentBySexAgeEdu_df["2011"].loc[tmp_nuts])

        for agent, tmp_home_nuts in zip(ags, home_nuts):
            if not tmp_home_nuts.startswith(NUTS_code):
                continue
            age, sex = agent["age"], agent["sex"]

            # Going to school
            tmp_edu_rate = edu_rate_age[age, sex]
            if age < 25:
                NperWpKind[age2schoolKind(age)] += tmp_edu_rate

            # Work
            tmp_notedu_rate = 1. - tmp_edu_rate
            pdf = np.concatenate((np.array([edu_levl_cdf[age, sex][0]]),
                                  np.diff(edu_levl_cdf[age, sex], axis=0)))
            assert .0 <= tmp_edu_rate <= 1.
            assert .9995 < pdf.sum() < 1.0005
            for edu, frac in enumerate(pdf):
                NperWpKind[10] += (frac * emp_rate_edu[age, sex, edu] *
                                   tmp_notedu_rate)

    # Expected unemployed/inactive agents = N - expected students - expected
    # employed; then remap the generic workplaces (kind 10) to position 4 for
    # plotting.
    NperWpKind[-1] = ags.shape[0] - sum(NperWpKind.values())
    NperWpKind[4] = NperWpKind.pop(10)

    # The generated values...
    bins = np.array([-1.5, -.5, .5, 1.5, 2.5, 3.5, 4.5])
    vals = np.array(ags["employed"])
    vals[vals == 10] = 4
    plt.hist(vals, bins=bins, rwidth=.75, color="C1", lw=2, label="Generated")
    Xs = sorted(NperWpKind.keys())
    Ys = [NperWpKind[k] for k in Xs]
    plt.plot(Xs, Ys, "o-C0", label="Actual data", lw=2, ms=14)
    loc2label = {
        -1: "Unemployed",
        0: "Kindergarten",
        1: "Primary Sc.",
        2: "Secondary Sc.",
        3: "University",
        4: "Employed",
    }
    plt.ylabel("Number of agents", size=18)
    plt.xticks(sorted(loc2label.keys()),
               [v for k, v in sorted(loc2label.items())],
               size=16,
               rotation=45,
               ha="right")
    plt.yticks(size=16)
    plt.legend(fontsize=16, loc="upper left", bbox_to_anchor=[.25, .95])
    plt.tight_layout()
    plt.savefig("figures/synPop_%s_03_wpKindEmployed.pdf" % (referenceName, ),
                bbox_inches="tight")
    plt.close()

    ##############
    # Households #
    ##############

    # Plot the location of households
    xs = hhs["lon"]
    ys = hhs["lat"]
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))
    ax.set_aspect("equal")
    res = plt.hexbin(xs,
                     ys,
                     cmap=plt.cm.Blues,
                     mincnt=1,
                     norm=matplotlib.colors.LogNorm())
    cbar = plt.colorbar(shrink=.65)
    cbar.set_label("Number of households", size=22)
    cbar.ax.tick_params(labelsize=16)
    geoDataFrame[2].plot(ax=ax,
                         color="none",
                         edgecolor="green",
                         linestyle="--",
                         lw=.5,
                         alpha=.8)
    geoDataFrame[1].plot(ax=ax,
                         color="none",
                         edgecolor="blue",
                         linestyle="-",
                         lw=.5,
                         alpha=.7)
    geoDataFrame[0].plot(ax=ax, color="none", edgecolor="black")
    dx = xs.max() - xs.min()
    dy = ys.max() - ys.min()
    max_dd = max(dx, dy)
    plt.xlim(xs.min() - .25, xs.min() + max_dd + .25)
    plt.ylim(ys.min() - .25, ys.min() + max_dd + .25)
    plt.xlabel("lon", size=22)
    plt.ylabel("lat", size=22)
    plt.xticks(size=16)
    plt.yticks(size=16)
    plt.tight_layout()
    plt.savefig("figures/synPop_%s_04_hhSpatialDistribution.pdf" %
                (referenceName, ),
                bbox_inches="tight")
    plt.close()

    # Clustering
    from collections import Counter
    from shapely.geometry import Polygon

    levelsTargetSize = cfg["levelsTargetSize"]
    NlevelsTargetSize = len(levelsTargetSize)
    Ntot_level = NlevelsTargetSize
    levelSizes = {
        tmp_l: Counter([
            tuple(hh["l%d" % l] for l in range(3 + tmp_l + 1)) for hh in hhs
        ])
        for tmp_l in range(Ntot_level)
    }

    codes = Counter([(h0, h1, h2)
                     for h0, h1, h2 in zip(hhs["l0"], hhs["l1"], hhs["l2"])])
    most_common = codes.most_common(10)[0][0]
    most_common_shape = reference_gdf[reference_gdf.code ==
                                      most_common].geometry
    print("Most common code:", most_common)

    tmp_hhs = hhs[hhs["l0"] == most_common[0]]
    tmp_hhs = tmp_hhs[tmp_hhs["l1"] == most_common[1]]
    tmp_hhs = tmp_hhs[tmp_hhs["l2"] == most_common[2]]

    plt.figure(figsize=(13, 4))
    minx, maxx = min(tmp_hhs["lon"]), max(tmp_hhs["lon"])
    miny, maxy = min(tmp_hhs["lat"]), max(tmp_hhs["lat"])
    ddx, ddy = maxx - minx, maxy - miny
    step_dx = 10.**np.floor(np.log10(ddx / 4.))
    step_dy = 10.**np.floor(np.log10(ddy / 4.))
    minx, maxx = minx - step_dx / 5., maxx + step_dx / 5.
    miny, maxy = miny - step_dy / 5., maxy + step_dy / 5.
    bounds_to_plot = reference_gdf[reference_gdf.intersects(
        Polygon([[minx, miny], [minx, maxy], [maxx, maxy], [maxx, miny],
                 [minx, miny]]))]
    for l in range(NlevelsTargetSize):
        ax = plt.subplot(1, NlevelsTargetSize, l + 1)
        ax.set_aspect("equal")
        plt.title("Level %d - size %d" % (3 + l, levelsTargetSize[l]), size=15)
        plt.scatter(tmp_hhs["lon"], tmp_hhs["lat"], c=tmp_hhs["l%d" % (l + 3)])
        bounds_to_plot.plot(ax=ax,
                            color="none",
                            edgecolor="black",
                            linestyle="-",
                            lw=.5,
                            alpha=.8)
        plt.xticks(np.arange(minx - minx % step_dx, maxx, step_dx),
                   size=12,
                   rotation=45,
                   ha="right")
        plt.yticks(np.arange(miny - miny % step_dy, maxy, step_dy), size=12)
        plt.xlim(minx, maxx)
        plt.ylim(miny, maxy)
        plt.xlabel("lon", size=14)
        plt.ylabel("lat", size=14)
    plt.tight_layout()
    plt.savefig("figures/synPop_%s_05_clusteringLocal.pdf" %
                (cfg["referenceName"]),
                bbox_inches="tight")
    plt.close()

    # Size per level...
    avgHHsize = hhs["size"].mean()
    plt.figure(figsize=(4.5 * NlevelsTargetSize, 4))
    for levelID, levelSize in enumerate(levelsTargetSize):
        ax = plt.subplot(1, NlevelsTargetSize, levelID + 1)
        ax.set_title("Level %d - Size %d" % (levelID + 3, levelSize), size=18)

        Xs = np.array(list(levelSizes[levelID].values()), dtype=np.float64)
        Xs *= avgHHsize
        r = plt.hist(Xs, rwidth=.95, label="Generated", color="C1")
        ySpan = [0, max(r[0]) * 1.15]
        plt.plot([levelSize] * 2, ySpan, "--C0", lw=6, label="Target")
        plt.xlabel(r"Cluster Size - $s$", size=18)
        plt.ylabel(r"$P(s)$", size=18)
        plt.xticks(size=14, rotation=45, ha="right")
        plt.ylim(ySpan)
        plt.legend(fontsize=14, loc="best")
    plt.tight_layout()
    plt.savefig("figures/synPop_%s_06_clusteringSizePerLeveL.pdf" %
                (cfg["referenceName"]),
                bbox_inches="tight")
    plt.close()

    # ## Household structure
    houseHolds = hhs

    # Save the relevant statistics for this area and compute the statistics for
    # the new area...
    # The overall age distribution
    sexAgeCDF_array = np.column_stack([
        ageBySex_CDF[sex].loc[NUTS3code] for sex in ["male", "female", "total"]
    ])
    sexAgePDF_array = np.column_stack([
        ageBySex_PDF[sex].loc[NUTS3code] for sex in ["male", "female", "total"]
    ])

    # The household type distribution
    houseHoldTypeCDF = np.array(householdKind_CDF.loc[NUTS3code])
    houseHoldTypePDF = np.array(householdKind_PDF.loc[NUTS3code])

    # The size distribution for each household type
    houseHoldType_sizeCDF = np.array(
        [householdSizeByKind_CDF[k].loc[NUTS3code] for k in householdLabels])
    houseHoldType_sizePDF = np.array(
        [householdSizeByKind_PDF[k].loc[NUTS3code] for k in householdLabels])

    # The age distribution of males and females for parents and children of each
    # household type
    agePDFparentSonHHtype = {}
    ageCDFparentSonHHtype = {}
    ageRAWparentSonHHtype = {}
    for hhKind in ageHouseholdLabels:
        agePDFparentSonHHtype[hhKind] = np.column_stack((
            ageByHHrole_PDF[("male", hhKind)].loc[NUTS3code],
            ageByHHrole_PDF[("female", hhKind)].loc[NUTS3code],
            ageByHHrole_PDF[("total", hhKind)].loc[NUTS3code],
        ))
        ageCDFparentSonHHtype[hhKind] = np.column_stack(
            (ageByHHrole_CDF[("male", hhKind)].loc[NUTS3code],
             ageByHHrole_CDF[("female", hhKind)].loc[NUTS3code],
             ageByHHrole_CDF[("total", hhKind)].loc[NUTS3code]))

        # The raw numbers
        ageRAWparentSonHHtype[hhKind] = np.column_stack(
            (ageByHHrole_RAW[("male", hhKind)].loc[NUTS3code],
             ageByHHrole_RAW[("female", hhKind)].loc[NUTS3code]))
        # Flatten into a single row and divide by the sum
        ageRAWparentSonHHtype[hhKind] = ageRAWparentSonHHtype[hhKind].flatten(
            order="C")
        ageRAWparentSonHHtype[hhKind] /= max(
            1., ageRAWparentSonHHtype[hhKind].sum())

    # Plot the age distribution of males and females given their role and
    # household status. Check that we correctly translated the Eurostat weights
    # into probabilities.
    nToPlot = len(ageRAWparentSonHHtype)
    plt.figure(figsize=(4 * nToPlot, 4))
    for iii, selectedHH in enumerate(ageRAWparentSonHHtype):
        plt.subplot(1, nToPlot, iii + 1)
        plt.title(selectedHH)
        # Since in the raw data we normalize over male+female, here we have to
        # "de-normalize" the PDF of the original distribution by the weights of
        # the male/female parts.
        maleWeight = ageRAWparentSonHHtype[selectedHH][::2].sum()
        femaleWeight = ageRAWparentSonHHtype[selectedHH][1::2].sum()
        plt.plot(np.arange(0, 101),
                 ageRAWparentSonHHtype[selectedHH][::2],
                 "oC1",
                 label="male RAW",
                 lw=2)
        plt.plot(np.arange(0, 101),
                 ageRAWparentSonHHtype[selectedHH][1::2],
                 "^C0",
                 label="female RAW",
                 lw=2)
        plt.plot(np.arange(0, 101, 1.),
                 agePDFparentSonHHtype[selectedHH][:, 0] * maleWeight,
                 "--C3",
                 label="male PDF",
                 lw=2)
        plt.plot(np.arange(0, 101, 1.),
                 agePDFparentSonHHtype[selectedHH][:, 1] * femaleWeight,
                 "--C9",
                 label="female PDF",
                 lw=2)
        plt.xlabel(r"Age - $a$", size=15)
        plt.ylabel(r"$P(a)$", size=15)
        plt.xticks(size=12)
        plt.yticks(size=12)
        plt.legend(fontsize=12)
    plt.tight_layout()
    # plt.savefig("figures/synPop_%s_rawVsDerivedAgePDF.pdf" % (referenceName, ),
    #             bbox_inches="tight")
    plt.close()

    # Household type frequency
    plt.figure(figsize=(5, 4))
    bins = np.arange(-.5, 7.5, 1)
    plt.hist(hhs["kind"],
             bins=bins,
             density=True,
             rwidth=.75,
             label="Generated",
             color="C1")
    plt.plot(np.arange(len(houseHoldTypePDF)),
             houseHoldTypePDF,
             "o-C0",
             ms=14,
             lw=2,
             label="Actual data")
    locs = [h["id"] for k, h in sorted(houseHoldTypeDict.items())]
    labs = [k for k, h in sorted(houseHoldTypeDict.items())]
    plt.xticks(locs, labs, size=16, rotation=45, ha="right")
    plt.yticks(size=16)
    plt.xlim(-.75, bins[-1])
    plt.legend(fontsize=12, loc="upper left", bbox_to_anchor=[.775, .975])
    plt.xlabel(r"Household type - $h$", size=18)
    plt.ylabel(r"$P(h)$", size=18)
    plt.tight_layout()
    plt.savefig("figures/synPop_%s_07_hhKindDistribution.pdf" % (referenceName),
                bbox_inches="tight")
    plt.close()

    # Household size frequency per household type
    nHouseholdKinds = len(houseHoldTypeDict)
    nCols = 3
    nRows = nHouseholdKinds // nCols
    fig, ax = plt.subplots(nrows=nRows,
                           ncols=nCols,
                           sharex=True,
                           figsize=(4 * nCols, 3 * nCols))
    for i, hhName in enumerate(houseHoldTypeDict):
        plt.subplot(nRows, nCols, (i // nCols) * nCols + i % nCols + 1)
        plt.title(hhName, size=18)
        selectedHHtype = i
        plt.hist(hhs[hhs["kind"] == selectedHHtype]["size"],
                 bins=np.arange(-.5, 12.5, 1.),
                 density=True,
                 label="Generated",
                 rwidth=.75,
                 color="C1")
        plt.plot(np.arange(len(houseHoldType_sizeCDF[selectedHHtype])) + 1.,
                 houseHoldType_sizePDF[selectedHHtype],
                 "o-C0",
                 label="Actual data",
                 ms=14,
                 lw=2)
        plt.xticks(range(0, 12), size=16)
        plt.yticks(size=16)
        plt.xlim(.25, 11.75)
        plt.legend(fontsize=16, loc="best")
    fig.text(.5, -.02, r"Members - $m$", size=22, ha="center")
    fig.text(-.02, .5, r"$P(m)$", size=22, va="center", rotation="vertical")
    plt.tight_layout()
    plt.savefig("figures/synPop_%s_08_hhSizePerKind.pdf" % (referenceName),
                bbox_inches="tight")
    plt.close()

    # Overall age distribution in the whole population
    fig, ax = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(15, 4))
    binwidth = 2
    plotCount = 1
    for selectedSexes, sexName in zip([[0], [1], [0, 1]],
                                      ["Male", "Female", "Total"]):
        plt.subplot(1, 3, plotCount)
        plt.title(sexName, size=16)
        # Select only the agents of the requested sex(es); the columns of the
        # reference arrays are ordered as (male, female, total).
        plt.hist(ags[np.isin(ags["sex"], selectedSexes)]["age"],
                 bins=np.arange(0, 103, binwidth),
                 density=True,
                 label="Generated",
                 rwidth=.9,
                 color="C1")
        plt.plot(np.arange(len(sexAgePDF_array[:, plotCount - 1])),
                 sexAgePDF_array[:, plotCount - 1],
                 "o-C0",
                 label="Actual data",
                 lw=3)
        plt.xticks(size=14)
        plt.yticks(size=14)
        plt.xlabel(r"Age - $a$", size=16)
        if plotCount == 1:
            plt.ylabel(r"$P(a)$", size=16)
        plotCount += 1
    plt.legend(fontsize=16, loc="upper left", bbox_to_anchor=[.7, 1.])
    plt.tight_layout()
    plt.savefig("figures/synPop_%s_09_agePopulationPerSex.pdf" %
                (referenceName, ),
                bbox_inches="tight")
    plt.close()

    # Age distribution per household kind and role covered by the person.
    dictRole = {0: "Children", 1: "Parent"}
    dictSex = {0: "male", 1: "female"}

    ###################
    ncols = len(houseHoldTypeDict)
    nrows = len(dictSex) * len(dictRole)
    fig, ax = plt.subplots(ncols=ncols,
                           nrows=nrows,
                           figsize=(4 * ncols, 4 * nrows))
    iii = 0
    for selectedRole, roleLabel in dictRole.items():
        for selectedSex, sexLabel in dictSex.items():
            for householdType, houseHoldData in houseHoldTypeDict.items():
                iii += 1
                plt.subplot(nrows, ncols, iii)
                plt.title(householdType, size=20)
                plt.xticks(size=16)
                plt.yticks(size=16)
                if iii % ncols == 1:
                    plt.ylabel(r"$P(a)$", size=18)
                if (iii - 1) // ncols >= nrows - 1:
                    plt.xlabel(r"Age - $a$", size=18)
                plt.xlim(-.5, 104)

                selectedHHkind = houseHoldData["id"]
                ages = np.array([
                    ag["age"]
                    for ag in ags[(ags["sex"] == selectedSex)
                                  & (ags["role"] == selectedRole)]
                    if hhs[ag["hh"]]["kind"] == houseHoldData["id"]
                ])
                if len(ages) < 2:
                    continue
                lGen = plt.hist(ages,
                                bins=np.arange(0, 102, 2),
                                density=True,
                                rwidth=.9,
                                label="Generated",
                                color="C1")

                # Mapping from (role, sex, household type) to the label of the
                # reference age PDF.
                roleSexHH2label = {
                    (1, 0, "M1_CH"): "A1_XCH",
                    (1, 1, "F1_CH"): "A1_XCH",
                    (1, 0, "MULTI_HH"): "A1_HH",
                    (1, 1, "MULTI_HH"): "A1_HH",
                    (1, 0, "CPL_WCH"): "CPL_XCH",
                    (1, 1, "CPL_WCH"): "CPL_XCH",
                    (1, 0, "A1_HH"): "A1_HH",
                    (1, 1, "A1_HH"): "A1_HH",
                    (1, 0, "CPL_NCH"): "CPL_XCH",
                    (1, 1, "CPL_NCH"): "CPL_XCH",
                    (0, 0, "M1_CH"): "CH_PAR",
                    (0, 1, "M1_CH"): "CH_PAR",
                    (0, 0, "F1_CH"): "CH_PAR",
                    (0, 1, "F1_CH"): "CH_PAR",
                    (0, 0, "MULTI_HH"): "CH_PAR",
                    (0, 1, "MULTI_HH"): "CH_PAR",
                    (0, 0, "CPL_WCH"): "CH_PAR",
                    (0, 1, "CPL_WCH"): "CH_PAR",
                }
                try:
                    lEmp = plt.plot(
                        agePDFparentSonHHtype[roleSexHH2label[(
                            selectedRole, selectedSex,
                            householdType)]][:, selectedSex],
                        "o-C0",
                        label="Actual data",
                        lw=3)
                except KeyError:
                    # Some (role, sex, household type) combinations have no
                    # reference age PDF.
                    pass
            fig.text(.5,
                     .9995 - selectedRole * .5 - selectedSex * .25,
                     roleLabel + " - " + sexLabel,
                     size=22,
                     ha="center")

    import matplotlib.lines as mlines
    empPatch = mlines.Line2D([], [],
                             linestyle="",
                             marker="s",
                             markersize=10,
                             color="C0",
                             label="Actual data")
    genPatch = mlines.Line2D([], [],
                             linestyle="",
                             marker="s",
                             markersize=10,
                             color="C1",
                             label="Generated")
    fig.legend(handles=[empPatch, genPatch],
               fontsize=20,
               loc="upper left",
               bbox_to_anchor=[.9, 1.065])
    plt.tight_layout(h_pad=4.)
    plt.savefig("figures/synPop_%s_10_agePerRole.pdf" % (referenceName, ),
                bbox_inches="tight")
    plt.close()
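# Hypothetical usage sketch (not part of the original script): `checkPopulation`
# resolves its configuration by module name via importlib, so any module exposing
# a `cfg` dict with the keys read above can be passed in. The module name
# "myPopConfig" and the "popScale" override below are illustrative placeholders.
def _example_checkPopulation():
    checkPopulation("myPopConfig", additionalArgs={"popScale": 0.1})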
def _do_average(subject, side):
    # parameters
    rootdir = 'Z:\\Userdata_Vicon_Server\\CP-projekti'
    plotdir = "Z:\\CP_projekti_analyysit\\Normal_vs_cognitive"
    max_files = None  # limit c3d files read (for debug)
    max_dist = 15  # deg, for outlier detection

    # special layout
    lout = [['HipAnglesX', 'KneeAnglesX', 'AnkleAnglesX'],
            ['PelvisAnglesX', 'PelvisAnglesY', 'PelvisAnglesZ'],
            ['ThoraxAnglesX', 'ThoraxAnglesY', 'ThoraxAnglesZ'],
            ['ShoulderAnglesX', 'ShoulderAnglesY', 'ShoulderAnglesZ']]
    # add side
    for i, row in enumerate(lout):
        for j, item in enumerate(row):
            lout[i][j] = side + item
    # flatten into list
    lout_ = [item for row in lout for item in row]

    # try to auto find data dirs under subject dir
    subjdir = op.join(rootdir, subject)
    datadirs = [
        file for file in os.listdir(subjdir)
        if op.isdir(op.join(subjdir, file))
    ]
    if len(datadirs) > 1:
        raise Exception('Multiple data dirs under subject')
    datadir = datadirs[0]

    # collect normal walk trials
    N_files = op.join(subjdir, datadir, '*N?_*.c3d')
    Nfiles = glob.glob(N_files)
    Nfiles = Nfiles[:max_files] if max_files is not None else Nfiles

    # collect cognitive trials
    C_files = op.join(subjdir, datadir, '*C?_*.c3d')
    Cfiles = glob.glob(C_files)
    Cfiles = Cfiles[:max_files] if max_files is not None else Cfiles

    if not (Cfiles and Nfiles):
        raise Exception('Not enough trials')

    # average over trials
    models = gaitutils.models.models_all[:2]  # PiG lower and upper body
    Cavgdata, Cstddata, C_ok, Ccyc = gaitutils.stats.average_trials(
        Cfiles, models, max_dist=max_dist)
    Navgdata, Nstddata, N_ok, Ncyc = gaitutils.stats.average_trials(
        Nfiles, models, max_dist=max_dist)
    Ntr = gaitutils.trial.AvgTrial(Navgdata)
    Ctr = gaitutils.trial.AvgTrial(Cavgdata)

    # plot all
    pl = gaitutils.Plotter()
    pl.layout = lout
    pl.trial = Ntr
    cfg['plot']['model_stddev_alpha'] = '0.2'
    cfg['plot']['model_stddev_colors'] = "{'R': 'blue', 'L': 'blue'}"
    cfg['plot']['model_tracecolors'] = "{'R': 'blue', 'L': 'blue'}"
    pl.plot_trial(plot_model_normaldata=False, model_stddev=Nstddata)
    cfg['plot']['model_stddev_colors'] = "{'R': 'red', 'L': 'red'}"
    cfg['plot']['model_tracecolors'] = "{'R': 'red', 'L': 'red'}"
    pl.trial = Ctr
    maintitle = '%s normal vs cognitive (%s)\n' % (subject, side)
    maintitle += 'N_cycles normal: %d, cognitive: %d' % (Ncyc[side],
                                                         Ccyc[side])
    pl.plot_trial(plot_model_normaldata=False,
                  model_stddev=Cstddata,
                  show=True,
                  superpose=True,
                  maintitle=maintitle)

    # create custom legend outside axes
    import matplotlib.lines as mlines
    l_norm = mlines.Line2D([], [], color='blue')
    l_cogn = mlines.Line2D([], [], color='red')
    plt.legend([l_norm, l_cogn], ['normal', 'cognitive'],
               bbox_to_anchor=(.98, .98),
               bbox_transform=plt.gcf().transFigure,
               fontsize=8)

    # create pdf and png figs
    figname = '%s_%s' % (subject, side)
    figname = op.join(plotdir, figname)
    plt.savefig(figname + '.pdf')
    plt.savefig(figname + '.png')
    logname = figname + '.log'

    # report N of cycles per var
    print('\n%s: %s' % (subject, side))
    print('N of normal cycles per variable:')
    print({key: var for key, var in N_ok.items() if key in lout_})
    print('N of cogn. cycles per variable:')
    print({key: var for key, var in C_ok.items() if key in lout_})
    # ...also into logfile
    with open(logname, 'w') as f:
        print('\n%s: %s' % (subject, side), file=f)
        print('N of normal cycles per variable:', file=f)
        print({key: var for key, var in N_ok.items() if key in lout_}, file=f)
        print('N of cogn. cycles per variable:', file=f)
        print({key: var for key, var in C_ok.items() if key in lout_}, file=f)
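# Hypothetical driver sketch (not from the original script): it simply loops
# `_do_average` over subjects and both sides. The subject IDs are placeholders
# and must match folder names under `rootdir`; sides follow the 'R'/'L'
# convention used above to prefix the PiG variable names in the layout.
def _example_run_averages():
    for subject in ['TD01', 'TD02']:  # placeholder subject folder names
        for side in ['R', 'L']:
            _do_average(subject, side)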