def execute(context):
    """Plot the mean of ``data[v] / data[0]`` per key with min/max whiskers.

    Reads the ``data`` stage, which is indexed here by the integers
    ``0 .. max(data.keys())``, and writes ``matching_rate.pdf`` into
    ``context.path()``.
    """
    data = context.stage("data")
    variables = max(data.keys()) + 1

    # Compute every ratio once and derive all statistics from it.
    rates = [data[v] / data[0] for v in range(variables)]
    means = [np.mean(rate) for rate in rates]
    lowers = [np.min(rate) for rate in rates]
    uppers = [np.max(rate) for rate in rates]

    # Prepare plot
    plotting.setup()

    plt.figure()
    plt.bar(range(variables), means, color=plotting.COLORS["synthetic"])

    # The loop variables must not be named ``min``/``max``: assigning to those
    # names anywhere in the function makes them locals, which turned the
    # ``max(data.keys())`` call above into an UnboundLocalError.
    for v, lower, upper in zip(range(variables), lowers, uppers):
        # NOTE(review): the whisker spans min..max although the label reads
        # "90% Conf."; no legend is drawn, so the label is never shown.
        plt.plot([v, v], [lower, upper],
                 linewidth=1, label="90% Conf.", color="k")

    plt.xlabel("Variables")
    plt.ylabel("Matching rate")

    axes = plt.gca()
    axes.yaxis.set_major_locator(tck.FixedLocator(np.arange(100) * 0.2))
    axes.yaxis.set_major_formatter(
        tck.FuncFormatter(lambda x, p: "%d%%" % (100 * x, )))

    plt.tight_layout()
    plt.savefig("%s/matching_rate.pdf" % context.path())
    # Release the figure, as the other plotting stages do.
    plt.close()
def execute(context):
    """Plot the mean of ``data[v] / data[0]`` per key on a random subsample.

    ``ESTIMATION_SAMPLES`` indices are drawn with replacement from
    ``range(len(data[0]))``; mean, 10th and 90th percentile of the ratio are
    computed on that subsample. The figure is written to
    ``matching_rate.pdf`` inside ``context.path()``.
    """
    data = context.stage("data")
    variables = max(data.keys()) + 1

    indices = np.random.randint(0, len(data[0]), size=ESTIMATION_SAMPLES)

    # Evaluate each ratio once instead of once per statistic.
    baseline = data[0][indices]
    rates = [data[v][indices] / baseline for v in range(variables)]

    means = [np.mean(rate) for rate in rates]
    q10s = [np.percentile(rate, 10) for rate in rates]
    q90s = [np.percentile(rate, 90) for rate in rates]

    # Prepare plot
    plotting.setup()

    plt.figure()
    plt.bar(range(variables), means, color=plotting.COLORS["synthetic"])

    for v, q10, q90 in zip(range(variables), q10s, q90s):
        # NOTE(review): the whisker spans the 10th..90th percentile while the
        # label reads "90% Conf."; no legend is drawn, so it is never shown.
        plt.plot([v, v], [q10, q90],
                 linewidth=1, label="90% Conf.", color="k")

    plt.xlabel("Variables")
    plt.ylabel("Matching rate")

    axes = plt.gca()
    axes.yaxis.set_major_locator(tck.FixedLocator(np.arange(100) * 0.2))
    axes.yaxis.set_major_formatter(
        tck.FuncFormatter(lambda x, p: "%d%%" % (100 * x, )))

    plt.tight_layout()
    plt.savefig("%s/matching_rate.pdf" % context.path())
    # Release the figure, as the other plotting stages do.
    plt.close()
def execute(context):
    """Plot ``get_error_probability`` over reference weights for three values of ``s``.

    One dotted curve is drawn per ``s`` in (0.01, 0.1, 0.25) with ``q`` fixed
    at 0.01. The figure is written to ``sampling_error.pdf`` inside
    ``context.path()``.
    """
    plotting.setup()

    q = 0.01
    sampling_rates = [0.01, 0.1, 0.25]
    colors = ["#000000", "#777777", "#cccccc"]

    plt.figure(figsize=plotting.WIDE_FIGSIZE)

    for s, color in zip(sampling_rates, colors):
        ws = np.linspace(0, 2000, 10000)
        probs = get_error_probability(ws, s, q)
        plt.plot(ws, probs, ".",
                 label="s = %.2f" % s, color=color, markersize=2)

    plt.legend(loc="best")
    plt.grid()
    plt.xlabel("Reference weight")
    plt.ylabel("Probability")

    plt.tight_layout()
    plt.savefig("%s/sampling_error.pdf" % context.path())
    # Release the figure, as the other plotting stages do.
    plt.close()
def execute(context):
    """Plot the HTS comparison of trip distances and of age classes.

    Reads the ``data.hts.comparison`` stage and writes
    ``distance_distribution.pdf`` and ``age_distribution.pdf`` into
    ``context.path()``.
    """
    plotting.setup()
    hts_comparison = context.stage("data.hts.comparison")

    def draw_bars(df, source, class_column, weight_column, offset, width,
                  label, color):
        """Draw the bars of one source, shifted by ``offset`` within each class."""
        df_source = df[df["hts"] == source]
        plt.bar(df_source[class_column].values + offset,
                df_source[weight_column].values / 1e6,
                width=width, label=label, align="edge", color=color,
                linewidth=0.5, edgecolor="white")

    def finish_axes():
        """Apply the grid styling shared by both figures."""
        plt.grid()
        plt.gca().set_axisbelow(True)
        plt.gca().xaxis.grid(alpha=0.0)

    # Distance distribution plot
    df_distance = hts_comparison["distance_distribution"]

    plt.figure()
    draw_bars(df_distance, "entd", "distance_class", "trip_weight",
              0.0, 0.4, "ENTD (Routed)", plotting.COLORS["entd"])
    draw_bars(df_distance, "egt", "distance_class", "trip_weight",
              0.4, 0.4, "EGT (Euclidean)", plotting.COLORS["egt"])

    # Ticks sit between the two bars of every second class.
    plt.gca().xaxis.set_major_locator(
        tck.FixedLocator(np.arange(0, 10, 2) + 0.4))
    plt.gca().xaxis.set_major_formatter(
        tck.FixedFormatter(["<%dkm" % d for d in np.arange(1, 10, 2)]))

    plt.gca().annotate(r"≥10 km", xy=(10.0, 8.0), xycoords="data", ha="right")

    finish_axes()
    plt.xlabel("Trip distance")
    plt.ylabel("Number of trips [$10^6$]")
    plt.legend()

    plt.tight_layout()
    plt.savefig("%s/distance_distribution.pdf" % context.path())
    plt.close()

    # HTS Age distribution plot
    df_age = hts_comparison["age_distribution"]

    plt.figure()
    draw_bars(df_age, "census", "age_class", "person_weight",
              0.0, 0.25, "Census", plotting.COLORS["census"])
    draw_bars(df_age, "entd", "age_class", "person_weight",
              0.25, 0.25, "ENTD", plotting.COLORS["entd"])
    draw_bars(df_age, "egt", "age_class", "person_weight",
              0.5, 0.25, "EGT", plotting.COLORS["egt"])

    # Centre each tick under its group of three bars. Only the age bounds
    # formatter is installed: an earlier "%d0s" formatter was overwritten
    # immediately and never had any effect.
    age_bounds = ["<15", "15-29", "30-44", "45-59", "60-74", ">75"]
    plt.gca().xaxis.set_major_locator(
        tck.FixedLocator(np.arange(1000) + 0.75 / 2))
    plt.gca().xaxis.set_major_formatter(tck.FixedFormatter(age_bounds))

    # Two lettered arrows pointing down at hand-picked data coordinates.
    markers = [
        ("A", 1.5 + 0.5 * 0.25, 2.0, 2.35),
        ("B", 4.25 + 0.5 * 0.25, 1.3, 1.65),
    ]
    for text, x, y_tip, y_text in markers:
        plt.gca().annotate(
            text,
            xy=(x, y_tip), xycoords="data",
            xytext=(x, y_text), textcoords="data",
            arrowprops={
                "arrowstyle": "-|>",
                "facecolor": "black",
                "linewidth": 0.5
            },
            bbox={
                "pad": 0.0,
                "linewidth": 0.0,
                "facecolor": (1.0, 0.0, 0.0, 0.0)
            },
            ha="center")

    finish_axes()
    plt.xlabel("Age")
    plt.ylabel("Number of persons [x$10^6$]")
    plt.legend()

    plt.tight_layout()
    plt.savefig("%s/age_distribution.pdf" % context.path())
    plt.close()
def execute(context):
    """Plot synthetic commute flows against the (scaled) census reference.

    Three figures are written into ``context.path()``:

    * ``commute_flows.pdf``: bars for the largest flows per slot,
    * ``commute_scatter.pdf``: log-log scatter of synthetic vs. reference,
    * ``commute_flow_boxplot.pdf``: relative error per slot.
    """
    plotting.setup()

    df_census = context.stage("census")
    _, df_correction = context.stage("hts")
    flows = context.stage("data")

    def load_flows(spec):
        """Join one slot's flows with its census reference and scale the latter."""
        slot = spec["slot"]
        df = pd.merge(flows[slot],
                      df_census[slot].rename(columns={"weight": "reference"}),
                      on=["home", slot])
        df = pd.merge(df, df_correction[slot], on="home")

        # A fixed factor in the spec overrides the per-home correction column.
        factor = spec["factor"] if "factor" in spec else df["factor"]
        df["scaled_reference"] = df["reference"] * factor
        return df

    # PLOT: Work / education flows
    plt.figure(figsize=plotting.WIDE_FIGSIZE)

    figures = [{
        "slot": "work",
        "title": "Work",
        "top": 12
    }, {
        "slot": "education",
        "title": "Education",
        "top": 12,
        "factor": 0.7
    }]

    for panel, figure in enumerate(figures):
        plt.subplot(1, 2, panel + 1)

        df = load_flows(figure)
        df = df.sort_values(by="scaled_reference",
                            ascending=False).head(figure["top"])

        # Use the actual row count so fewer than ``top`` flows still plot.
        positions = np.arange(len(df))

        plt.bar(positions, df["reference"], width=0.4, align="edge",
                linewidth=0.5, edgecolor="white",
                color=plotting.COLORS["census"], alpha=0.25)
        plt.bar(positions, df["scaled_reference"], width=0.4, label="Census",
                align="edge", linewidth=0.5, edgecolor="white",
                color=plotting.COLORS["census"])
        plt.bar(positions + 0.4, df["mean"] / SAMPLING_RATE, width=0.4,
                label="Synthetic", align="edge", linewidth=0.5,
                edgecolor="white", color=plotting.COLORS["synthetic"])

        # Whiskers are centred on the synthetic bar.
        for position, (q5, q95) in enumerate(
                zip(df["q5"].values, df["q95"].values)):
            x = position + 0.4 + 0.2
            plt.plot([x, x], [q5 / SAMPLING_RATE, q95 / SAMPLING_RATE],
                     color='k', linewidth=1.0)

        plt.grid()
        plt.gca().set_axisbelow(True)
        plt.gca().xaxis.grid(alpha=0.0)

        plt.gca().yaxis.set_major_locator(
            tck.FixedLocator(np.arange(100) * 1e5))
        plt.gca().yaxis.set_major_formatter(
            tck.FuncFormatter(lambda x, p: "%d" % (x * 1e-3, )))

        origins, destinations = df["home"].values, df[figure["slot"]].values

        plt.gca().xaxis.set_major_locator(tck.FixedLocator(positions + 0.4))
        plt.gca().xaxis.set_major_formatter(
            tck.FixedFormatter(
                ["%s\n%s" % item for item in zip(origins, destinations)]))

        plt.ylabel("Commuters [x1000]")
        plt.legend(loc="best")
        plt.title(figure["title"])

    plt.tight_layout()
    plt.savefig("%s/commute_flows.pdf" % context.path())
    plt.close()

    # PLOT: Scatter
    plt.figure(figsize=plotting.SHORT_FIGSIZE)

    parts = [{
        "slot": "work",
        "title": "Work",
        "marker": ".",
        "color": "k"
    }, {
        "slot": "education",
        "title": "Education",
        "factor": 0.7,
        "marker": ".",
        "color": plotting.COLORS["egt"]
    }]

    minimum = np.inf
    maximum = -np.inf

    for part in parts:
        df = load_flows(part)

        plt.loglog(df["scaled_reference"], df["mean"] / SAMPLING_RATE,
                   markersize=2, marker=part["marker"], color=part["color"],
                   linestyle="none", label=part["title"])

        # Track a common axis range with a 10% margin on either side.
        minimum = min(minimum, df["scaled_reference"].min() * 0.9)
        maximum = max(maximum, df["scaled_reference"].max() * 1.1)

    x = np.linspace(minimum, maximum, 100)
    plt.fill_between(x, x * 0.8, x * 1.2, color="k", alpha=0.2,
                     linewidth=0.0, label=r"20% Error")

    plt.xlim([minimum, maximum])
    plt.ylim([minimum, maximum])

    plt.grid()
    plt.gca().set_axisbelow(True)
    plt.legend()

    plt.xlabel("Reference flow")
    plt.ylabel("Synthetic flow")

    plt.tight_layout()
    plt.savefig("%s/commute_scatter.pdf" % context.path())
    plt.close()

    # PLOT: Histogram
    plt.figure(figsize=plotting.SHORT_FIGSIZE)

    parts = [{
        "slot": "work",
        "title": "Work"
    }, {
        "slot": "education",
        "title": "Education",
        "factor": 0.7
    }]

    for index, part in enumerate(parts):
        df = load_flows(part)

        # Relative error of the synthetic flow in percent.
        df["difference"] = 100 * (
            df["mean"] / SAMPLING_RATE -
            df["scaled_reference"]) / df["scaled_reference"]

        q5 = df["difference"].quantile(0.05)
        q95 = df["difference"].quantile(0.95)
        mean = df["difference"].mean()

        # All values are drawn as points, not only those outside q5..q95.
        points = df["difference"].values

        left, right = index - 0.2, index + 0.2

        plt.plot([left, right], [q5, q5], color="k", linewidth=1.0)
        plt.plot([left, right], [q95, q95], color="k", linewidth=1.0)
        plt.plot([left, right], [mean, mean], color="k", linewidth=1.0,
                 linestyle=":")
        plt.plot([left, left], [q5, q95], color="k", linewidth=1.0)
        plt.plot([right, right], [q5, q95], color="k", linewidth=1.0)

        plt.plot([index] * len(points), points, color="k", marker=".",
                 markersize=2, linestyle="none")

    plt.gca().xaxis.set_major_locator(tck.FixedLocator([0, 1]))
    plt.gca().xaxis.set_major_formatter(
        tck.FixedFormatter(["Work", "Education"]))

    plt.ylabel("Error [%]")
    plt.xlim([-0.5, 1.5])

    plt.grid()
    plt.gca().set_axisbelow(True)
    plt.gca().xaxis.grid(alpha=0.0)

    # Invisible artists that only provide the legend entries.
    plt.bar([np.nan], [np.nan], color="none", edgecolor="k", linewidth=1.0,
            label="5% - 95%")
    plt.plot([np.nan], color="k", linestyle=":", label="Mean")

    plt.legend(loc="best")

    plt.tight_layout()
    plt.savefig("%s/commute_flow_boxplot.pdf" % context.path())
    plt.close()
def execute(context):
    """Plot the ten most frequent activity chains per sex against the HTS.

    Merges the ``("age_range", "sex", "chain")`` marginal of the ``data`` stage
    with the HTS reference, keeps the rows whose ``age_range`` flag is set and
    writes ``activity_chains.pdf`` into ``context.path()``.
    """
    plotting.setup()

    reference = context.stage("analysis.reference.hts.chains")
    data = context.stage("data")

    # PLOT: Activity chains by sex
    marginal = ("age_range", "sex", "chain")
    df_chains = pd.merge(
        data[marginal],
        reference[marginal].rename(columns={"weight": "reference"}))
    df_chains = df_chains[df_chains["age_range"]]

    def top_chains(sex):
        """Return the ten rows with the largest reference weight for ``sex``."""
        df_sex = df_chains[df_chains["sex"] == sex]
        return df_sex.sort_values(by="reference", ascending=False).head(10)

    df_female = top_chains("female")
    df_male = top_chains("male")

    def chain_formatter(chains):
        """Build a tick formatter bound to one subplot's own chain labels."""

        def format_chain(x, position):
            # Guard against positions without a matching chain.
            if position is None or not 0 <= position < len(chains):
                return ""
            return "\n".join(chains[position]).upper()

        return tck.FuncFormatter(format_chain)

    plt.figure(figsize=plotting.WIDE_FIGSIZE)
    hts_name = context.config("hts")

    panels = zip([df_male, df_female], ["Male (18-40)", "Female (18-40)"])

    for index, (df_panel, title) in enumerate(panels):
        plt.subplot(1, 2, index + 1)

        plt.bar(np.arange(10), df_panel["reference"], width=0.4, label="HTS",
                align="edge", linewidth=0.5, edgecolor="white",
                color=plotting.COLORS[hts_name])
        plt.bar(np.arange(10) + 0.4, df_panel["mean"] / SAMPLING_RATE,
                width=0.4, label="Synthetic", align="edge", linewidth=0.5,
                edgecolor="white", color=plotting.COLORS["synthetic"])

        # Whiskers are centred on the synthetic bar. The names ``lower`` and
        # ``upper`` avoid shadowing the ``min``/``max`` builtins.
        for location, (lower, upper) in enumerate(
                zip(df_panel["min"].values, df_panel["max"].values)):
            x = location + 0.4 + 0.2
            plt.plot([x, x], [lower / SAMPLING_RATE, upper / SAMPLING_RATE],
                     "k", linewidth=1)

        plt.grid()
        plt.gca().set_axisbelow(True)
        plt.gca().xaxis.grid(alpha=0.0)

        if hts_name == "egt":
            plt.ylim([0, 3.5e5])
        else:
            plt.ylim([0, 5e5])

        # Invisible line that only provides the "Range" legend entry.
        plt.plot([np.nan], color="k", linewidth=1, label="Range")

        plt.gca().yaxis.set_major_locator(
            tck.FixedLocator(np.arange(100) * 1e5))
        plt.gca().yaxis.set_major_formatter(
            tck.FuncFormatter(lambda x, p: "%d" % (x * 1e-3, )))

        plt.gca().xaxis.set_major_locator(
            tck.FixedLocator(np.arange(10) + 0.4))
        # Formatters run at draw time, after the loop has finished. Binding
        # the chains here stops the first panel showing the last panel's labels.
        plt.gca().xaxis.set_major_formatter(
            chain_formatter(df_panel["chain"].values))

        if index == 1:
            # The right panel shares the y axis labelling of the left one.
            plt.gca().yaxis.set_major_formatter(
                tck.FixedFormatter([""] * 1000))
            plt.gca().yaxis.get_label().set_visible(False)

        handles, labels = plt.gca().get_legend_handles_labels()
        handles = [handles[-2], handles[-1], handles[-3]]
        labels = [labels[-2], labels[-1], labels[-3]]
        plt.legend(handles=handles, labels=labels, loc="best", title=title)

        if index == 0:
            plt.ylabel("Number of persons [x1000]")

    plt.tight_layout()
    plt.savefig("%s/activity_chains.pdf" % context.path())
    plt.close()
def execute(context):
    """Plot the secondary-location input distributions and distance CDFs.

    Writes ``input_distributions.pdf`` (mean distance per travel time bound and
    mode) and ``distance_distributions.pdf`` (synthetic vs. HTS distance CDF
    for two modes) into ``context.path()``.
    """
    plotting.setup()

    hts_name = context.config("hts")
    reference_data = context.stage("analysis.reference.hts.mode_distances")

    # PLOT: Input distributions
    distributions = context.stage(
        "synthesis.population.spatial.secondary.distance_distributions")

    plt.figure()

    modes = list(reference_data.keys())

    for index, mode in enumerate(modes):
        mode_distribution = distributions[mode]

        # Work on a copy so the upstream stage's array is not modified when
        # the open-ended last bound is replaced by six hours.
        bounds = np.array(mode_distribution["bounds"])
        bounds[~np.isfinite(bounds)] = 6 * 3600

        # Every series starts at the origin.
        means = [0.0]
        q10 = [0.0]
        q90 = [0.0]

        for distribution in mode_distribution["distributions"]:
            values = distribution["values"]
            cdf = distribution["cdf"]
            weights = distribution["weights"] / np.sum(
                distribution["weights"])

            means.append(np.sum(weights * values))
            # First value whose cumulative share reaches the quantile.
            q10.append(values[np.count_nonzero(cdf < 0.1)])
            q90.append(values[np.count_nonzero(cdf < 0.9)])

        times = [0.0] + list(bounds)

        if mode in ("car", "pt"):
            plt.fill_between(times, q10, q90,
                             color=plotting.COLORSET5[index], alpha=0.25,
                             linewidth=0.0)

        plt.plot(times, means,
                 label="%s (%d)" % (plotting.MODE_LABELS[mode], len(bounds)),
                 linewidth=1.0, marker=".", markersize=3,
                 color=plotting.COLORSET5[index])

    # Ticks every 20 minutes / 5 km; the data is in seconds / metres.
    plt.gca().xaxis.set_major_locator(
        tck.FixedLocator(np.arange(100) * 60 * 20))
    plt.gca().xaxis.set_major_formatter(
        tck.FuncFormatter(lambda x, p: str(x // 60)))

    plt.gca().yaxis.set_major_locator(
        tck.FixedLocator(np.arange(100) * 5 * 1000))
    plt.gca().yaxis.set_major_formatter(
        tck.FuncFormatter(lambda x, p: str(x // 1000)))

    plt.legend(loc="upper left")

    plt.xlim([0, 90 * 60 if hts_name == "egt" else 50 * 60])
    plt.ylim([0, 45 * 1000 if hts_name == "egt" else 25 * 1000])

    plt.grid()
    plt.xlabel("Travel time [min]")
    plt.ylabel("Euclidean distance [km]")

    plt.tight_layout()
    plt.savefig("%s/input_distributions.pdf" % context.path())
    plt.close()

    # PLOT: Distance distributions
    df_synthetic = context.stage("analysis.synthesis.mode_distances")

    plt.figure(figsize=(6.0, 2.5), dpi=100)  # 2.5 * 2.5

    limits = dict(car=20 * 1e3,
                  car_passenger=20 * 1e3,
                  pt=20 * 1e3,
                  bike=6 * 1e3,
                  walk=1 * 1e3)

    # Show car plus bike, falling back to walk when no bike data exists.
    modes = ["car", "bike" if "bike" in modes else "walk"]

    for index, mode in enumerate(modes):
        plt.subplot(1, 2, index + 1)

        mode_reference = reference_data[mode]
        plt.plot(mode_reference["values"] * 1e-3, mode_reference["cdf"],
                 linestyle='--', color="k", linewidth=1.0, label="HTS")

        df_mode = df_synthetic[df_synthetic["mode"] == mode]
        plt.fill_betweenx(df_mode["cdf"], df_mode["q5"] * 1e-3,
                          df_mode["q95"] * 1e-3, linewidth=0.0,
                          color=plotting.COLORS[hts_name], alpha=0.25,
                          label="90% Conf.")
        plt.plot(df_mode["mean"] * 1e-3, df_mode["cdf"],
                 color=plotting.COLORS[hts_name], linewidth=1.0,
                 label="Synthetic")

        plt.xlim([0, limits[mode] * 1e-3])
        plt.ylim([0, 1])

        plt.title(plotting.MODE_LABELS[mode], fontsize=plotting.FONT_SIZE)
        plt.xlabel("Euclidean distance [km]")
        plt.grid()

        if index % 2 == 0:
            plt.ylabel("Cumulative density")

        if index % 2 == 1:
            plt.legend(loc="best")

    plt.tight_layout()
    plt.savefig("%s/distance_distributions.pdf" % context.path())
    plt.close()
def execute(context):
    """Draw one horizontal bar chart per level comparing synthetic and reference counts.

    For the ``person`` and ``household`` levels the prepared marginals are
    plotted against their census or HTS reference, and each chart is saved as
    ``<level>.pdf`` inside ``context.path()``.
    """
    plotting.setup()

    hts = context.stage("analysis.reference.hts.sociodemographics")
    census = context.stage("analysis.reference.census.sociodemographics")
    data = context.stage("data")

    person_marginals = [
        "age_class", "sex", "employed", "studies", "has_license",
        "has_pt_subscription", "socioprofessional_class"
    ]
    household_marginals = [
        "household_size_class", "number_of_vehicles_class",
        "number_of_bikes_class"
    ]

    figures = [
        {
            "level": "person",
            "label": "Number of persons",
            "size": (6.0, 5.0),
            "marginals": person_marginals,
        },
        {
            "level": "household",
            "label": "Number of households",
            "size": plotting.WIDE_FIGSIZE,
            "marginals": household_marginals,
        },
    ]

    # Keyword arguments shared by all four bar groups.
    bar_style = dict(height=0.4, align="edge", linewidth=0.5,
                     edgecolor="white")

    for figure in figures:
        level = figure["level"]
        plt.figure(figsize=figure["size"])

        df_figure = prepare_data(data, hts, census, level,
                                 figure["marginals"], SAMPLING_RATE)
        reweight_hts(df_figure, hts, census, level)
        add_labels(df_figure)

        locations = np.arange(len(df_figure))

        # Rows whose reference comes from the census.
        from_census = (df_figure["reference_source"] == "census").values
        plt.barh(locations[from_census],
                 df_figure["reference"].values[from_census],
                 label="Census", color=plotting.COLORS["census"],
                 **bar_style)
        plt.barh(locations[from_census] + 0.4,
                 df_figure["mean"].values[from_census],
                 label="Synthetic", color=plotting.COLORS["synthetic"],
                 **bar_style)

        # Rows whose reference comes from the household travel survey.
        from_hts = (df_figure["reference_source"] == "hts").values
        hts_name = context.config("hts")
        plt.barh(locations[from_hts],
                 df_figure["reference"].values[from_hts],
                 label="HTS", color=plotting.COLORS[hts_name], **bar_style)
        plt.barh(locations[from_hts] + 0.4,
                 df_figure["mean"].values[from_hts],
                 label=None, color=plotting.COLORS["synthetic"],
                 **bar_style)

        # One min..max whisker per row, centred on the synthetic bar.
        bounds = zip(df_figure["min"].values, df_figure["max"].values)
        for row, (lower, upper) in enumerate(bounds):
            height = row + 0.4 + 0.2
            plt.plot([lower, upper], [height, height], "k", linewidth=1,
                     label="Range")

        axes = plt.gca()
        axes.yaxis.set_major_locator(tck.FixedLocator(locations + 0.4))
        axes.yaxis.set_major_formatter(
            tck.FixedFormatter(df_figure["label"].values))

        if level == "person":
            axes.xaxis.set_major_locator(
                tck.FixedLocator(np.arange(1, 100) * 1e6 * 2))
            axes.xaxis.set_major_formatter(
                tck.FuncFormatter(lambda x, p: "%dM" % (x / 1e6, )))

        if level == "household":
            axes.xaxis.set_major_locator(
                tck.FixedLocator(np.arange(1, 100) * 1e6 * 0.5))
            axes.xaxis.set_major_formatter(
                tck.FuncFormatter(lambda x, p: "%.1fM" % (x / 1e6, )))

        plt.grid()
        axes.set_axisbelow(True)
        axes.yaxis.grid(alpha=0.0)
        axes.invert_yaxis()

        plt.xlabel(figure["label"])

        # Pick four entries, counted from the end of the handle list, in a
        # fixed display order.
        handles, labels = axes.get_legend_handles_labels()
        order = (-2, -1, -3, -4)
        handles = [handles[position] for position in order]
        labels = [labels[position] for position in order]
        plt.legend(handles=handles, labels=labels, loc="best")

        plt.tight_layout()
        plt.savefig("%s/%s.pdf" % (context.path(), figure["level"]))
        plt.close()
def execute(context):
    """Plot the commute distance CDF of the synthetic data against the HTS.

    One solid (work) and one dashed (education) curve pair is drawn, and the
    figure is saved as ``commute_distance_cdf.pdf`` inside ``context.path()``.
    """
    plotting.setup()

    hts_data = context.stage("hts")
    data = context.stage("data")
    census_data = context.stage("census")

    plt.figure(figsize=plotting.SHORT_FIGSIZE)

    # (slot, line style, legend title)
    parts = [
        ("work", "-", "Work"),
        ("education", "--", "Educ."),
    ]

    for slot, linestyle, title in parts:
        synthetic = data[slot]
        survey = hts_data[slot]

        #plt.plot(census_data[slot]["centroid_distance"] * 1e-3, census_data[slot]["cdf"], color = plotting.COLORS["census"], linestyle = part["linestyle"], linewidth = 1.0)

        plt.plot(synthetic["mean"], synthetic["cdf"],
                 color="k", linestyle=linestyle, linewidth=1.0)
        plt.fill_betweenx(synthetic["cdf"], synthetic["q5"],
                          synthetic["q95"],
                          color="k", linewidth=0.0, alpha=0.25)

        # The survey distance is scaled by 1e-3 before plotting.
        plt.plot(survey["euclidean_distance"] * 1e-3, survey["cdf"],
                 color=plotting.COLORS["egt"], linestyle=linestyle,
                 linewidth=1.0)

        # Invisible line that only provides the legend entry for this slot.
        plt.plot([np.nan], color="k", linewidth=1.0, linestyle=linestyle,
                 label=title)

    # Invisible lines that only provide the legend entries for the sources.
    plt.plot([np.nan], color="k", linewidth=1.0, label="Synthetic")
    plt.plot([np.nan], color=plotting.COLORS["egt"], linewidth=1.0,
             label="EGT")

    plt.xlim([0, 40])
    plt.ylim([0, 1])

    plt.legend(loc="best", ncol=2)

    plt.grid()
    plt.gca().set_axisbelow(True)

    plt.xlabel("Euclidean commute distance [km]")
    plt.ylabel("Cumulative density")

    plt.tight_layout()
    plt.savefig("%s/commute_distance_cdf.pdf" % context.path())
    plt.close()
def execute(context):
    """Compare synthetic and census counts for two communes side by side.

    For each commune the household size and age class marginals are drawn as
    horizontal bars with q5..q95 whiskers. The figure is saved as
    ``comparison.pdf`` inside ``context.path()``.
    """
    plotting.setup()

    census = context.stage("analysis.reference.census.sociodemographics")
    data = context.stage("data")

    cases = [
        {"commune": 75106, "title": "16th Arrondissement"},
        {"commune": 94002, "title": "Alfortville"},
    ]

    bar_style = dict(height=0.4, align="edge", linewidth=0.5,
                     edgecolor="white")

    plt.figure(figsize=plotting.WIDE_FIGSIZE)

    for case_index, case in enumerate(cases):
        commune = case["commune"]
        case_census = filter_commune(census, commune)
        case_data = filter_commune(data, commune)

        # The census is passed in both reference positions of prepare_data.
        df_households = prepare_data(case_data, case_census, case_census,
                                     "household", ["household_size_class"],
                                     SAMPLING_RATE)
        df_persons = prepare_data(case_data, case_census, case_census,
                                  "person", ["age_class"], SAMPLING_RATE)
        df_case = pd.concat([df_households, df_persons])

        add_labels(df_case)

        plt.subplot(1, 2, case_index + 1)
        axes = plt.gca()
        locations = np.arange(len(df_case))

        reference_values = df_case["reference"].values
        mean_values = df_case["mean"].values

        plt.barh(locations, reference_values, label="Census",
                 color=plotting.COLORS["census"], **bar_style)
        plt.barh(locations + 0.4, mean_values, label="Synthetic",
                 color=plotting.COLORS["synthetic"], **bar_style)

        # One q5..q95 whisker per row, centred on the synthetic bar.
        quantiles = zip(df_case["q5"].values, df_case["q95"].values)
        for row, (q5, q95) in enumerate(quantiles):
            height = row + 0.4 + 0.2
            plt.plot([q5, q95], [height, height], "k", linewidth=1,
                     label="90% Conf.")

        axes.yaxis.set_major_locator(tck.FixedLocator(locations + 0.4))

        # Only the left panel carries the row labels.
        if case_index == 0:
            tick_labels = df_case["label"].values
        else:
            tick_labels = [""] * 100
        axes.yaxis.set_major_formatter(tck.FixedFormatter(tick_labels))

        plt.grid()
        axes.set_axisbelow(True)
        axes.yaxis.grid(alpha=0.0)
        axes.invert_yaxis()

        plt.xlabel("Number of persons / households")
        plt.title(case["title"])

        if case_index == 0:
            handles, labels = axes.get_legend_handles_labels()
            order = (-2, -1, -3)
            handles = [handles[position] for position in order]
            labels = [labels[position] for position in order]
            plt.legend(handles=handles, labels=labels, loc="best")

    plt.tight_layout()
    plt.savefig("%s/comparison.pdf" % (context.path(), ))
    plt.close()
def execute(context):
    """Compare synthetic and census counts for two communes side by side.

    For each commune the household size and age class marginals are drawn as
    horizontal bars with min..max whiskers. The figure is saved as
    ``comparison.pdf`` inside ``context.path()``.
    """
    plotting.setup()

    census = context.stage("analysis.reference.census.sociodemographics")
    data = context.stage("data")

    cases = [
        {"commune": 44109, "title": "Nantes Centre"},
        {"commune": 44158, "title": "Saint Etienne de Montluc"},
    ]

    bar_style = dict(height=0.4, align="edge", linewidth=0.5,
                     edgecolor="white")

    plt.figure(figsize=plotting.WIDE_FIGSIZE)

    for case_index, case in enumerate(cases):
        commune = case["commune"]
        case_census = filter_commune(census, commune)
        case_data = filter_commune(data, commune)

        # The census is passed in both reference positions of prepare_data.
        df_households = prepare_data(case_data, case_census, case_census,
                                     "household", ["household_size_class"],
                                     SAMPLING_RATE)
        df_persons = prepare_data(case_data, case_census, case_census,
                                  "person", ["age_class"], SAMPLING_RATE)
        df_case = pd.concat([df_households, df_persons])

        add_labels(df_case)

        plt.subplot(1, 2, case_index + 1)
        axes = plt.gca()
        locations = np.arange(len(df_case))

        reference_values = df_case["reference"].values
        mean_values = df_case["mean"].values

        plt.barh(locations, reference_values, label="Census",
                 color=plotting.COLORS["census"], **bar_style)
        plt.barh(locations + 0.4, mean_values, label="Synthetic",
                 color=plotting.COLORS["synthetic"], **bar_style)

        # One min..max whisker per row, centred on the synthetic bar.
        extremes = zip(df_case["min"].values, df_case["max"].values)
        for row, (lower, upper) in enumerate(extremes):
            height = row + 0.4 + 0.2
            plt.plot([lower, upper], [height, height], "k", linewidth=1,
                     label="Range")

        axes.yaxis.set_major_locator(tck.FixedLocator(locations + 0.4))

        # Only the left panel carries the row labels.
        if case_index == 0:
            tick_labels = df_case["label"].values
        else:
            tick_labels = [""] * 100
        axes.yaxis.set_major_formatter(tck.FixedFormatter(tick_labels))

        # Counts are shown in thousands.
        axes.xaxis.set_major_formatter(
            tck.FuncFormatter(lambda x, p: "%dk" % (x // 1000, )))

        plt.grid()
        axes.set_axisbelow(True)
        axes.yaxis.grid(alpha=0.0)
        axes.invert_yaxis()

        plt.xlabel("Number of persons / households")
        plt.title(case["title"])

        #plt.ylim([len(locations) + 2.5, -0.5])

        if case_index == 1:
            handles, labels = axes.get_legend_handles_labels()
            order = (-2, -1, -3)
            handles = [handles[position] for position in order]
            labels = [labels[position] for position in order]
            plt.legend(handles=handles, labels=labels, loc=(0.05, 0.32),
                       framealpha=1.0)

    plt.tight_layout()
    plt.savefig("%s/comparison.pdf" % (context.path(), ))
    plt.close()
def execute(context):
    """Write the Monte Carlo error probability table and its two-panel figure.

    Outputs, both inside ``context.path()``:

    * ``monte_carlo_table.tex``: error probability per marginal value and
      sampling rate at ``ACQUISITION_SAMPLE_SIZE`` samples,
    * ``monte_carlo.pdf``: (a) q5..q95 band of the stratum weight over the
      sample size and (b) error probability over the sampling rate.
    """
    data = context.stage("analysis.synthesis.statistics.monte_carlo")

    # Prepare data for error probability table
    records = []

    for marginal in TABLE_MARGINALS:
        df_marginal = data[(marginal, )]
        values = np.sort(df_marginal[(marginal, )].drop_duplicates().values)

        for value in values:
            row = {"marginal": marginal, "value": value}

            df_value = df_marginal[df_marginal[marginal] == value]
            df_value = df_value[df_value["samples"] ==
                                ACQUISITION_SAMPLE_SIZE]

            # Exactly one row per sampling rate is expected here.
            assert len(df_value) == len(SAMPLING_RATES)

            probabilities = df_value.sort_values(
                by=["sampling_rate",
                    "samples"])["error_probability"].values[:, 0]

            # NOTE(review): pairing relies on SAMPLING_RATES being in the same
            # (ascending) order as the sorted rows; confirm.
            row.update(zip(SAMPLING_RATES, probabilities))

            records.append(row)

    df_table = pd.DataFrame.from_records(records)
    df_table = create_table(df_table)
    df_table.to_latex("%s/monte_carlo_table.tex" % context.path(),
                      escape=False)

    # Prepare data for plotting
    reference = context.stage(
        "analysis.reference.census.sociodemographics")["person"]

    # Perform plotting
    plotting.setup()
    plt.figure(figsize=plotting.WIDE_FIGSIZE)

    # ... subplot on nominal stratum values
    plt.subplot(1, 2, 1)
    plt.title("(a) Monte Carlo analysis", fontsize=plotting.FONT_SIZE)

    df_marginal, reference_value = select(reference, data, SELECTED_MARGINAL,
                                          SELECTED_VALUES)
    assert len(df_marginal) == ACQUISITION_SAMPLE_SIZE * len(SAMPLING_RATES)

    # The list was previously defined but the loop repeated the literal.
    display_sampling_rates = [0.001, 0.01, 0.05]

    for index, sampling_rate in enumerate(display_sampling_rates):
        df_rate = df_marginal[df_marginal["sampling_rate"] == sampling_rate]
        df_rate = df_rate.sort_values(by="samples")

        # Later bands are drawn progressively more opaque.
        plt.fill_between(df_rate["samples"],
                         df_rate[("weight", "q5")],
                         df_rate[("weight", "q95")],
                         alpha=0.25 + index * 0.2,
                         color=plotting.COLORSET[0],
                         linewidth=0.0)

    sample_range = [1, ACQUISITION_SAMPLE_SIZE]
    plt.plot(sample_range, [reference_value] * 2, 'k--',
             label="Ref. $y$", linewidth=1.0)
    plt.plot(sample_range, [reference_value * 0.99] * 2, 'k:',
             label="1% Err.", linewidth=1.0)
    plt.plot(sample_range, [reference_value * 1.01] * 2, 'k:', linewidth=1.0)

    plt.xlabel("Sample size $N$")
    plt.ylabel("Stratum weight")
    plt.gca().yaxis.set_major_formatter(
        tck.FuncFormatter(lambda x, p: "%.2fM" % (x * 1e-6, )))

    plt.grid()
    plt.gca().set_axisbelow(True)
    plt.xlim(sample_range)

    # Invisible band that only provides the legend entry.
    plt.fill_between([np.nan], [np.nan], [np.nan],
                     color=plotting.COLORSET[0], alpha=0.25,
                     label="90% Conf.")
    plt.legend(loc="lower center", ncol=2)

    # ... subplot on error probability per sampling rate
    plt.subplot(1, 2, 2)
    plt.title("(b) Error probability", fontsize=plotting.FONT_SIZE)

    for index, values in enumerate(ADDITIONAL_VALUES):
        df_marginal, reference_value = select(reference, data,
                                              SELECTED_MARGINAL, values)
        assert len(
            df_marginal) == ACQUISITION_SAMPLE_SIZE * len(SAMPLING_RATES)

        df_max = df_marginal[df_marginal["samples"] ==
                             ACQUISITION_SAMPLE_SIZE]
        df_max = df_max.sort_values(by="sampling_rate")

        plt.plot(100 * np.array(SAMPLING_RATES),
                 df_max[("error_probability", "mean")],
                 color=plotting.COLORSET[index],
                 label="Age %s" % ADDITIONAL_LABELS[index],
                 marker=".", markersize=3.0, linewidth=1.0)

    max_rate_percent = 100 * max(SAMPLING_RATES)
    plt.plot([0, max_rate_percent], [0.9] * 2, 'k:',
             label="90% Prob.", linewidth=1.0)

    plt.xlim([0, max_rate_percent])
    plt.ylim([0, 1.0])

    plt.xlabel("Sampling rate $s$ [%]")
    plt.ylabel("Error probability")

    plt.grid()
    plt.gca().set_axisbelow(True)
    plt.legend(loc="center", ncol=1)

    plt.tight_layout()
    plt.savefig("%s/monte_carlo.pdf" % context.path())
    plt.close()
def execute(context):
    """Plot the income imputation scatter and the household income CDFs.

    Writes ``income_imputation.pdf`` and ``income_distributions.pdf`` into
    ``context.path()``.
    """
    plotting.setup()

    # Income imputation
    df_income = context.stage("data.income.municipality")
    df_imputed = df_income[df_income["is_imputed"]]

    plt.figure()

    reference_median = df_imputed["reference_median"]
    imputed = df_imputed["q5"]

    # Diagonal spanning the joint value range of both columns.
    minimum = min(reference_median.min(), imputed.min()) * 1e-3
    maximum = max(reference_median.max(), imputed.max()) * 1e-3
    plt.plot([minimum, maximum], [minimum, maximum], "k--")

    # Rows not flagged as missing are drawn as dots, the others as crosses.
    present = ~df_imputed["is_missing"]
    df_present = df_imputed[present]
    df_missing = df_imputed[~present]

    plt.plot(df_present["reference_median"] * 1e-3,
             df_present["q5"] * 1e-3,
             '.', markersize=3, color=plotting.COLORSET[0], label="y")
    plt.plot(df_missing["reference_median"] * 1e-3,
             df_missing["q5"] * 1e-3,
             'x', markersize=3, color=plotting.COLORSET[1])

    plt.xlabel("Reference median income [1000 EUR]")
    plt.ylabel("Imputed median income [1000 EUR]")
    plt.grid()

    plt.tight_layout()
    plt.savefig("%s/income_imputation.pdf" % context.path())
    plt.close()

    # Income distributions
    plt.figure()

    df_data = context.stage("data")
    df_reference = context.stage("analysis.reference.income")

    # (source key, color key, extra line style) in drawing order.
    reference_curves = [
        ("entd", "entd", dict(label="ENTD", linewidth=1.0)),
        ("egt", "egt", dict(label="EGT", linewidth=1.0)),
        ("filo", "census",
         dict(label="Tax data", linewidth=1.0, marker=".", markersize=3)),
    ]

    for source, color_key, style in reference_curves:
        df_source = df_reference[df_reference["source"] == source]
        plt.plot(df_source["income"].values * 1e-3,
                 df_source["cdf"].values,
                 color=plotting.COLORS[color_key], **style)

    cdf = df_data["cdf"].values
    plt.plot(df_data["mean"].values * 1e-3, cdf,
             color="k", label="Synthetic", linewidth=1.0, linestyle=":")
    plt.fill_betweenx(cdf,
                      df_data["min"].values * 1e-3,
                      df_data["max"].values * 1e-3,
                      color="k", linewidth=0.0, alpha=0.25)

    plt.xlim([0, 60])

    plt.xlabel("Household income [1000 EUR]")
    plt.ylabel("Cumulative density")
    plt.legend(loc="lower right")
    plt.grid()

    plt.tight_layout()
    plt.savefig("%s/income_distributions.pdf" % context.path())
    plt.close()
def execute(context):
    """Plot, per sampling rate, the ``precision`` metric against the seed count.

    The reference weight is the ``weight`` of the single stratum of the census
    marginal ``MARGINAL`` (at ``MARGINAL_LEVEL``) selected by ``VALUES``. For
    every rate in ``SAMPLING_RATES`` the same stratum is collected from the
    ``sample_<rate>`` stages, and ``stats.bootstrap`` is evaluated for every
    sample size from 1 to ``MAXIMUM_SAMPLE_SIZE``. The ``precision`` metric is
    the share of values ``x`` for which
    ``abs(x / sampling_rate - reference) / reference <= ERROR_THRESHOLD``.

    The figure is written to ``error_probability.pdf`` in ``context.path()``.
    """
    # Obtain reference data
    reference = context.stage("analysis.reference.census.sociodemographics")
    reference = reference[MARGINAL_LEVEL][MARGINAL]
    reference = reference[np.logical_and.reduce([
        reference[name] == value for name, value in zip(MARGINAL, VALUES)
    ])]["weight"].values[0]

    # Gather marginal information
    df_data = []

    for sampling_rate in SAMPLING_RATES:
        df_marginals = []

        for df_stage in bt.get_stages(context, "sample_%f" % sampling_rate,
                                      sample_size=ACQUISITION_SAMPLE_SIZE):
            marginals.prepare_classes(df_stage)
            df_stage = stats.marginalize(df_stage, [MARGINAL],
                                         weight_column=None)[MARGINAL]
            df_stage["sampling_rate"] = sampling_rate
            df_marginals.append(df_stage)

        df_marginals = stats.collect_sample(df_marginals)

        # Keep only the stratum that is compared against the reference weight
        df_marginals = df_marginals[np.logical_and.reduce([
            df_marginals[name] == value
            for name, value in zip(MARGINAL, VALUES)
        ])]

        df_data.append(df_marginals)

    df_data = pd.concat(df_data)

    sample_sizes = np.arange(1, MAXIMUM_SAMPLE_SIZE + 1)
    df_figure = []

    for sampling_rate in SAMPLING_RATES:
        # The rows belonging to one sampling rate do not depend on the sample
        # size, so select them once per rate rather than once per bootstrap.
        df_rate = df_data[df_data["sampling_rate"] == sampling_rate]

        for sample_size in context.progress(
                sample_sizes, label="Calculating sample sizes ..."):
            # drop() returns a new frame, so every bootstrap call still
            # receives its own input object.
            df_marginals = df_rate.drop(columns=["sampling_rate"])

            df_bootstrap = stats.bootstrap(
                df_marginals,
                ESTIMATION_SAMPLES,
                sample_size,
                metrics={
                    "mean": "mean",
                    "q5": lambda x: x.quantile(0.05),
                    "q95": lambda x: x.quantile(0.95),
                    # Share of values whose relative deviation from the
                    # reference stays within ERROR_THRESHOLD
                    "precision": lambda x: np.mean(
                        np.abs(x / sampling_rate - reference) / reference
                        <= ERROR_THRESHOLD)
                })

            df_bootstrap["sample_size"] = sample_size
            df_bootstrap["sampling_rate"] = sampling_rate

            df_figure.append(df_bootstrap)

    df_figure = pd.concat(df_figure)

    # Plotting
    plotting.setup()
    plt.figure(figsize=plotting.SHORT_FIGSIZE)

    for sampling_rate in SAMPLING_RATES:
        df_rate = df_figure[df_figure["sampling_rate"] == sampling_rate]
        plt.plot(df_rate["sample_size"], df_rate["precision"],
                 label=SAMPLING_RATE_LABELS[sampling_rate],
                 color=SAMPLING_RATE_COLORS[sampling_rate])

    # Dotted horizontal guide at 90%
    plt.plot([0, MAXIMUM_SAMPLE_SIZE + 1], [0.9, 0.9], 'k:')

    plt.xlim([1, MAXIMUM_SAMPLE_SIZE])
    plt.ylim([0, 1.05])
    plt.xlabel("Number of seeds $K$")
    plt.ylabel(r"Error probability")

    plt.gca().xaxis.set_major_locator(tck.FixedLocator([1, 10, 20, 30, 40]))
    plt.gca().yaxis.set_major_formatter(
        tck.FuncFormatter(lambda x, p: "%d%%" % (x * 100, )))

    plt.grid()
    plt.gca().set_axisbelow(True)
    plt.legend(loc="best", title="Sampling rate $s$")

    plt.tight_layout()
    plt.savefig("%s/error_probability.pdf" % context.path())
    plt.close()
def execute(context):
    """Compare the most frequent activity chains between EGT and ENTD.

    The ``("age_range", "sex", "chain")`` marginals of the ``egt`` and
    ``entd`` stages are merged, restricted to rows where ``age_range`` is
    true, and for each sex the (up to) ten chains with the largest ``egt``
    weight are shown as paired bars in a two-panel figure (male on the left,
    female on the right).

    The figure is written to ``activity_chains.pdf`` in ``context.path()``
    and then shown via ``plt.show()``.
    """
    plotting.setup()

    marginal = ("age_range", "sex", "chain")
    df_egt = context.stage("egt")[marginal].rename(columns={"weight": "egt"})
    df_entd = context.stage("entd")[marginal].rename(
        columns={"weight": "entd"})

    df = pd.merge(df_egt, df_entd, on=["age_range", "sex", "chain"])
    df = df[df["age_range"]]

    df_female = df[df["sex"] == "female"].sort_values(
        by="egt", ascending=False).head(10)
    df_male = df[df["sex"] == "male"].sort_values(
        by="egt", ascending=False).head(10)

    plt.figure(figsize=plotting.WIDE_FIGSIZE)

    panels = zip([df_male, df_female], ["Male (18-40)", "Female (18-40)"])

    for index, (df_panel, title) in enumerate(panels):
        plt.subplot(1, 2, index + 1)

        chains = df_panel["chain"].values
        # Follow the actual number of rows; head(10) may return fewer than ten
        positions = np.arange(len(df_panel))

        plt.bar(positions, df_panel["egt"], width=0.4, label="EGT",
                align="edge", linewidth=0.5, edgecolor="white",
                color=plotting.COLORS["egt"])
        plt.bar(positions + 0.4, df_panel["entd"], width=0.4, label="ENTD",
                align="edge", linewidth=0.5, edgecolor="white",
                color=plotting.COLORS["entd"])

        plt.grid()
        plt.gca().set_axisbelow(True)
        plt.gca().xaxis.grid(alpha=0.0)

        plt.gca().yaxis.set_major_locator(
            tck.FixedLocator(np.arange(100) * 1e5))
        plt.gca().yaxis.set_major_formatter(
            tck.FuncFormatter(lambda x, p: "%d" % (x * 1e-3, )))

        # Formatters are only invoked when the figure is drawn, i.e. after
        # this loop has finished. The chains are therefore bound as a default
        # argument so that each panel keeps its own labels instead of both
        # panels reading whatever the loop variable holds last.
        def format_chain(x, pos, chains=chains):
            if pos is None or pos >= len(chains):
                return ""

            # One character of the chain per line, upper-cased
            return "\n".join(chains[pos]).upper()

        plt.gca().xaxis.set_major_locator(tck.FixedLocator(positions + 0.4))
        plt.gca().xaxis.set_major_formatter(tck.FuncFormatter(format_chain))

        if index == 1:
            # Right panel: blank y tick labels and hide the axis label
            plt.gca().yaxis.set_major_formatter(
                tck.FixedFormatter([""] * 1000))
            plt.gca().yaxis.get_label().set_visible(False)

        plt.legend(loc="best", title=title)

        if index == 0:
            plt.ylabel("Number of persons [x1000]")

    plt.tight_layout()

    # Save before showing: on interactive backends the figure may be gone
    # once show() returns, which would make savefig() write an empty page.
    plt.savefig("%s/activity_chains.pdf" % context.path())
    plt.show()
    plt.close()
def execute(context):
    """Plot the bootstrapped stratum weight against the number of seeds.

    The reference weight is the ``weight`` of the single stratum of the census
    marginal ``MARGINAL`` (at ``MARGINAL_LEVEL``) selected by ``VALUES``. The
    same stratum is collected from the ``sample`` stages and passed to
    ``stats.bootstrap`` for every sample size from 1 to
    ``MAXIMUM_SAMPLE_SIZE``. The resulting ``mean``, ``q5`` and ``q95``
    columns are divided by ``SAMPLING_RATE`` and drawn together with the
    reference weight and its +/- 1% lines.

    The figure is written to ``sample_count.pdf`` in ``context.path()``.
    """
    # Obtain reference data
    census = context.stage("analysis.reference.census.sociodemographics")
    df_reference = census[MARGINAL_LEVEL][MARGINAL]
    reference_mask = np.logical_and.reduce([
        df_reference[name] == value for name, value in zip(MARGINAL, VALUES)
    ])
    reference = df_reference[reference_mask]["weight"].values[0]

    # Gather information
    stage_marginals = []

    for df_stage in bt.get_stages(context, "sample",
                                  sample_size=ACQUISITION_SAMPLE_SIZE):
        marginals.prepare_classes(df_stage)
        marginalized = stats.marginalize(df_stage, [MARGINAL],
                                         weight_column=None)
        stage_marginals.append(marginalized[MARGINAL])

    df_marginals = stats.collect_sample(stage_marginals)

    # Keep only the stratum that is compared against the reference weight
    stratum_mask = np.logical_and.reduce([
        df_marginals[name] == value for name, value in zip(MARGINAL, VALUES)
    ])
    df_marginals = df_marginals[stratum_mask]

    sample_sizes = np.arange(1, MAXIMUM_SAMPLE_SIZE + 1)

    bootstrap_frames = []

    for sample_size in context.progress(
            sample_sizes, label="Calculating sample sizes ..."):
        df_bootstrap = stats.bootstrap(df_marginals, ESTIMATION_SAMPLES,
                                       sample_size)
        df_bootstrap["sample_size"] = sample_size
        bootstrap_frames.append(df_bootstrap)

    df_figure = pd.concat(bootstrap_frames)

    # Scale the sampled statistics by the sampling rate
    for column in ("mean", "q5", "q95"):
        df_figure[column] /= SAMPLING_RATE

    # Prepare plot
    plotting.setup()
    plt.figure(figsize=plotting.SHORT_FIGSIZE)

    x_extent = [1, MAXIMUM_SAMPLE_SIZE]

    plt.fill_between(
        df_figure["sample_size"],
        df_figure["q5"],
        df_figure["q95"],
        alpha=0.25,
        label="90% Conf.",
        color=plotting.COLORSET[0],
        linewidth=0.0,
    )

    # Reference weight and the band of 1% deviation around it
    plt.plot(x_extent, [reference] * 2, 'k--', label="Ref. $w$")
    plt.plot(x_extent, [reference * 0.99] * 2, 'k:', label="1% Err.")
    plt.plot(x_extent, [reference * 1.01] * 2, 'k:')

    plt.plot(
        df_figure["sample_size"],
        df_figure["mean"],
        label=r"$\mathrm{\mathbf{E}}[\tilde w_K]$",
        color=plotting.COLORSET[0],
    )

    plt.xlim([1, MAXIMUM_SAMPLE_SIZE])
    plt.xlabel("Number of seeds $K$")
    plt.ylabel("Stratum weight")

    axes = plt.gca()
    axes.xaxis.set_major_locator(tck.FixedLocator([1, 5, 10, 15, 20, 25]))
    axes.yaxis.set_major_formatter(
        tck.FuncFormatter(lambda x, p: "%.2fM" % (x * 1e-6, )))

    plt.grid()
    plt.gca().set_axisbelow(True)
    plt.legend(loc="best", ncol=2)

    plt.tight_layout()
    plt.savefig("%s/sample_count.pdf" % context.path())
    plt.close()