def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    feeder = zip(
        bs.get_stages(context, "synthesis.population.spatial.home.locations", acquisition_sample_size),
        bs.get_stages(context, "synthesis.population.spatial.primary.locations", acquisition_sample_size),
        bs.get_stages(context, "synthesis.population.sampled", acquisition_sample_size),
    )

    probabilities = np.linspace(0.0, 1.0, 20)
    quantiles = {"work": [], "education": []}

    with context.progress(label="Processing commute data ...", total=acquisition_sample_size) as progress:
        for df_home, df_spatial, df_persons in feeder:
            # Prepare home locations, indexed by person
            df_home = pd.merge(df_home, df_persons[["person_id", "household_id"]], on="household_id")
            df_home = df_home[["person_id", "geometry"]].set_index("person_id").sort_index()
            assert len(df_home) == len(df_persons)

            for index, name in enumerate(("work", "education")):
                df_destination = df_spatial[index]
                df_destination = df_destination[["person_id", "geometry"]]
                df_destination = df_destination.set_index("person_id").sort_index()

                df_compare = df_home.loc[df_destination.index]
                assert len(df_destination) == len(df_compare)

                # Home-to-destination crowfly distance in kilometers
                distances = df_destination["geometry"].distance(df_compare["geometry"]) * 1e-3
                quantiles[name].append([distances.quantile(p) for p in probabilities])

            progress.update()

    result = {}
    random = np.random.RandomState(0)

    for name in ("work", "education"):
        data = np.array(quantiles[name])

        # Bootstrap over the acquired realizations (using the seeded generator)
        indices = random.randint(acquisition_sample_size, size=ESTIMATION_SAMPLE_SIZE)

        mean = np.mean(data[indices, :], axis=0)
        q5 = np.percentile(data[indices, :], 5, axis=0)
        q95 = np.percentile(data[indices, :], 95, axis=0)

        result[name] = pd.DataFrame(dict(mean=mean, q5=q5, q95=q95, cdf=probabilities))

    return result
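# The stage above establishes the bootstrap pattern reused throughout this
# section: each acquired seed yields one quantile curve, and resampling the
# curves with replacement gives a mean and a 90% interval per probability
# level. Below is a minimal self-contained sketch of that resampling step,
# assuming only numpy and hypothetical sizes (40 realizations, 20 levels);
# it is an illustration, not the project's stats module.

import numpy as np

def bootstrap_band(curves, estimation_sample_size, seed=0):
    # curves: (K, P) array with one quantile curve per realization
    random = np.random.RandomState(seed)
    indices = random.randint(len(curves), size=estimation_sample_size)
    resampled = curves[indices, :]

    return (
        np.mean(resampled, axis=0),           # point estimate per level
        np.percentile(resampled, 5, axis=0),  # lower bound of the 90% interval
        np.percentile(resampled, 95, axis=0)  # upper bound of the 90% interval
    )

# Usage with synthetic curves standing in for pipeline output
curves = np.sort(np.random.RandomState(1).randn(40, 20), axis=1)
mean, q5, q95 = bootstrap_band(curves, estimation_sample_size=1000)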
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    person_marginals = []
    household_marginals = []

    feeder = zip(
        bs.get_stages(context, "synthesis.population.enriched", acquisition_sample_size),
        bs.get_stages(context, "synthesis.population.spatial.home.zones", acquisition_sample_size)
    )

    for df, df_home in feeder:
        # Attach the spatial attributes of the home zone to each person
        df = pd.merge(df, df_home[["household_id", "departement_id", "commune_id"]])
        marginals.prepare_classes(df)

        person_marginals.append(stats.marginalize(
            df, marginals.SPATIAL_PERSON_MARGINALS, weight_column=None))
        household_marginals.append(stats.marginalize(
            df.drop_duplicates("household_id"), marginals.SPATIAL_HOUSEHOLD_MARGINALS, weight_column=None))

    person_marginals = stats.combine_marginals(person_marginals)
    household_marginals = stats.combine_marginals(household_marginals)

    person_marginals = stats.apply_per_marginal(person_marginals, stats.analyze_sample_and_flatten)
    household_marginals = stats.apply_per_marginal(household_marginals, stats.analyze_sample_and_flatten)

    return dict(person=person_marginals, household=household_marginals)
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    probabilities = np.linspace(0.0, 1.0, 20)
    modes = ["car", "car_passenger", "pt", "bike", "walk"]
    quantiles = {mode: [] for mode in modes}

    generator = zip(
        bs.get_stages(context, "synthesis.population.spatial.locations", acquisition_sample_size),
        bs.get_stages(context, "synthesis.population.trips", acquisition_sample_size)
    )

    with context.progress(label="Processing distance data ...", total=acquisition_sample_size) as progress:
        for df_locations, df_trips in generator:
            # Load locations and calculate euclidean distances to the following activity
            df_locations = df_locations[["person_id", "activity_index", "geometry"]].rename(
                columns={"activity_index": "trip_index"})
            df_locations["euclidean_distance"] = df_locations["geometry"].distance(
                df_locations["geometry"].shift(-1))

            # Merge mode into distances; the inner merge drops the spurious
            # distance computed between the last activity of one person and the
            # first activity of the next, since no matching trip_index exists
            df_trips = pd.merge(df_trips[[
                "person_id", "trip_index", "mode",
                "preceding_purpose", "following_purpose",
                "departure_time", "arrival_time"
            ]], df_locations, on=["person_id", "trip_index"], how="inner")

            df_trips["travel_time"] = df_trips["arrival_time"] - df_trips["departure_time"]

            # Filter out trips between primary activities
            primary_activities = ["home", "work", "education"]
            df_trips = df_trips[~(
                df_trips["preceding_purpose"].isin(primary_activities) &
                df_trips["following_purpose"].isin(primary_activities)
            )]

            # Calculate quantiles per mode
            for mode in modes:
                df_mode = df_trips[df_trips["mode"] == mode]
                quantiles[mode].append([df_mode["euclidean_distance"].quantile(p) for p in probabilities])

            progress.update()

    for mode in modes:
        quantiles[mode] = np.array(quantiles[mode])

    random = np.random.RandomState(0)
    df_data = []

    for mode in modes:
        # Bootstrap over the acquired realizations (using the seeded generator)
        indices = random.randint(acquisition_sample_size, size=ESTIMATION_SAMPLE_SIZE)

        mean = np.mean(quantiles[mode][indices, :], axis=0)
        q5 = np.percentile(quantiles[mode][indices, :], 5, axis=0)
        q95 = np.percentile(quantiles[mode][indices, :], 95, axis=0)

        df_data.append(pd.DataFrame(dict(mean=mean, q5=q5, q95=q95, cdf=probabilities)))
        df_data[-1]["mode"] = mode

    return pd.concat(df_data)
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    feeder = zip(
        bs.get_stages(context, "synthesis.population.spatial.home.zones", acquisition_sample_size),
        bs.get_stages(context, "synthesis.population.spatial.primary.locations", acquisition_sample_size),
        bs.get_stages(context, "synthesis.population.sampled", acquisition_sample_size),
    )

    work_flows = []
    education_flows = []

    with context.progress(label="Processing commute data ...", total=acquisition_sample_size) as progress:
        for sample, (df_home, df_spatial, df_persons) in enumerate(feeder):
            # Prepare home
            df_home = pd.merge(df_persons[["person_id", "household_id"]], df_home, on="household_id")
            df_home = df_home[["person_id", "departement_id"]].rename(columns={"departement_id": "home"})

            # Prepare work (numeric INSEE codes embed the departement in the leading digits)
            df_work = df_spatial[0]
            df_work["departement_id"] = df_work["commune_id"] // 1000
            df_work = df_work[["person_id", "departement_id"]].rename(columns={"departement_id": "work"})

            # Calculate work flows
            df_work = pd.merge(df_home, df_work, on="person_id").groupby(
                ["home", "work"]).size().reset_index(name="weight")
            df_work["sample"] = sample
            work_flows.append(df_work)

            # Prepare education
            df_education = df_spatial[1]
            df_education["departement_id"] = df_education["commune_id"] // 1000
            df_education = df_education[["person_id", "departement_id"]].rename(columns={"departement_id": "education"})

            # Calculate education flows
            df_education = pd.merge(df_home, df_education, on="person_id").groupby(
                ["home", "education"]).size().reset_index(name="weight")
            df_education["sample"] = sample
            education_flows.append(df_education)

            progress.update()

    df_work = pd.concat(work_flows)
    df_education = pd.concat(education_flows)

    df_work = stats.bootstrap(df_work, ESTIMATION_SAMPLE_SIZE)
    df_education = stats.bootstrap(df_education, ESTIMATION_SAMPLE_SIZE)

    return dict(work=df_work, education=df_education)
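# A note on the derivation above, not from the original code: 5-digit numeric
# INSEE commune codes start with the departement code, so integer division by
# 1000 recovers it. This shortcut fails for non-numeric codes such as
# Corsica's 2A/2B, which is presumably why the variant further below joins
# against data.spatial.municipalities instead.

assert 75101 // 1000 == 75  # Paris, 1st arrondissement -> departement 75
assert 69123 // 1000 == 69  # Lyon -> departement 69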
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    person_marginals = []
    household_marginals = []

    for df in bs.get_stages(context, "synthesis.population.enriched", acquisition_sample_size):
        marginals.prepare_classes(df)

        person_marginals.append(stats.marginalize(
            df, marginals.ANALYSIS_PERSON_MARGINALS, weight_column=None))
        household_marginals.append(stats.marginalize(
            df.drop_duplicates("household_id"), marginals.ANALYSIS_HOUSEHOLD_MARGINALS, weight_column=None))

    person_marginals = stats.combine_marginals(person_marginals)
    household_marginals = stats.combine_marginals(household_marginals)

    person_marginals = stats.apply_per_marginal(person_marginals, stats.analyze_sample_and_flatten)
    household_marginals = stats.apply_per_marginal(household_marginals, stats.analyze_sample_and_flatten)

    return dict(person=person_marginals, household=household_marginals)
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    person_marginals = []
    household_marginals = []

    for df in bs.get_stages(context, "synthesis.population.enriched", acquisition_sample_size):
        marginals.prepare_classes(df)

        person_marginals.append(stats.marginalize(
            df, marginals.ANALYSIS_PERSON_MARGINALS, weight_column=None))
        household_marginals.append(stats.marginalize(
            df.drop_duplicates("household_id"), marginals.ANALYSIS_HOUSEHOLD_MARGINALS, weight_column=None))

    person_marginals = stats.collect_marginalized_sample(person_marginals)
    household_marginals = stats.collect_marginalized_sample(household_marginals)

    person_marginals = stats.bootstrap_sampled_marginals(person_marginals, ESTIMATION_SAMPLE_SIZE)
    household_marginals = stats.bootstrap_sampled_marginals(household_marginals, ESTIMATION_SAMPLE_SIZE)

    return dict(person=person_marginals, household=household_marginals)
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    feeder = zip(
        bs.get_stages(context, "synthesis.population.sampled", acquisition_sample_size),
        bs.get_stages(context, "synthesis.population.activities", acquisition_sample_size)
    )

    with context.progress(label="Marginalizing chain data ...", total=acquisition_sample_size):
        with context.parallel() as parallel:
            data = list(parallel.imap_unordered(execute_parallel, feeder))

    data = stats.combine_marginals(data)
    data = stats.apply_per_marginal(data, stats.analyze_sample_and_flatten)

    return data
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    feeder = zip(
        bs.get_stages(context, "synthesis.population.sampled", acquisition_sample_size),
        bs.get_stages(context, "synthesis.population.activities", acquisition_sample_size)
    )

    with context.progress(label="Marginalizing chain data ...", total=acquisition_sample_size):
        with context.parallel() as parallel:
            data = list(parallel.imap_unordered(execute_parallel, feeder))

    data = stats.collect_marginalized_sample(data)
    data = stats.bootstrap_sampled_marginals(data, ESTIMATION_SAMPLE_SIZE)

    return data
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    probabilities = np.linspace(0.0, 1.0, 20)
    quantiles = []

    with context.progress(label="Processing income data ...", total=acquisition_sample_size) as progress:
        for df_income in bs.get_stages(context, "synthesis.population.income", acquisition_sample_size):
            # Annual income per consumption unit
            income = 12 * df_income["household_income"] / df_income["consumption_units"]
            quantiles.append([income.quantile(p) for p in probabilities])
            progress.update()

    quantiles = np.array(quantiles)

    # Report the full envelope across realizations (avoid shadowing the built-ins)
    mean = np.mean(quantiles, axis=0)
    minimum = np.min(quantiles, axis=0)
    maximum = np.max(quantiles, axis=0)

    return pd.DataFrame(dict(mean=mean, min=minimum, max=maximum, cdf=probabilities))
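# A worked example of the normalization above, assuming household_income is
# reported per month (hence the factor 12) and consumption_units follows the
# OECD-modified scale (1.0 for the first adult, 0.5 per additional adult,
# 0.3 per child): a couple with one child earning 3000 EUR per month has
# 1.0 + 0.5 + 0.3 = 1.8 consumption units, so
# 12 * 3000 / 1.8 == 20000 EUR per consumption unit and year.
# Unlike the bootstrapped stages, this one reports the full min/max envelope
# across realizations.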
def execute(context):
    reference = context.stage("analysis.reference.census.sociodemographics")["person"]

    output = {marginal: [] for marginal in MARGINALS}
    total = len(SAMPLING_RATES) * len(MARGINALS) * ACQUISITION_SAMPLE_SIZE

    with context.progress(label="Running Monte Carlo analysis ...", total=total) as progress:
        for sampling_rate in SAMPLING_RATES:
            partial_marginals = list(bt.get_stages(
                context, "sample_%f" % sampling_rate, sample_size=ACQUISITION_SAMPLE_SIZE))

            with context.parallel(data=dict(
                partial_marginals=partial_marginals, reference=reference, sampling_rate=sampling_rate
            )) as parallel:
                for partial_output in parallel.imap_unordered(process, np.arange(1, ACQUISITION_SAMPLE_SIZE + 1)):
                    for marginal in MARGINALS:
                        output[marginal].append(partial_output[marginal])

    for marginal in MARGINALS:
        output[marginal] = pd.concat(output[marginal])

    return output
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    probabilities = np.linspace(0.0, 1.0, 20)
    quantiles = []

    with context.progress(label="Processing income data ...", total=acquisition_sample_size) as progress:
        for df_income in bs.get_stages(context, "synthesis.population.income", acquisition_sample_size):
            # Annual income per consumption unit
            income = 12 * df_income["household_income"] / df_income["consumption_units"]
            quantiles.append([income.quantile(p) for p in probabilities])
            progress.update()

    quantiles = np.array(quantiles)

    # Bootstrap over the acquired realizations (using the seeded generator)
    random = np.random.RandomState(0)
    indices = random.randint(acquisition_sample_size, size=ESTIMATION_SAMPLE_SIZE)

    mean = np.mean(quantiles[indices, :], axis=0)
    q5 = np.percentile(quantiles[indices, :], 5, axis=0)
    q95 = np.percentile(quantiles[indices, :], 95, axis=0)

    return pd.DataFrame(dict(mean=mean, q5=q5, q95=q95, cdf=probabilities))
def execute(context):
    df_codes = context.stage("data.spatial.municipalities")[["commune_id", "departement_id"]]

    acquisition_sample_size = context.config("acquisition_sample_size")

    feeder = zip(
        bs.get_stages(context, "synthesis.population.spatial.home.zones", acquisition_sample_size),
        bs.get_stages(context, "synthesis.population.spatial.primary.locations", acquisition_sample_size),
        bs.get_stages(context, "synthesis.population.sampled", acquisition_sample_size),
    )

    work_flows = []
    education_flows = []

    with context.progress(label="Processing commute data ...", total=acquisition_sample_size) as progress:
        for realization, (df_home, df_spatial, df_persons) in enumerate(feeder):
            # Prepare home
            df_home = pd.merge(df_persons[["person_id", "household_id"]], df_home, on="household_id")
            df_home = df_home[["person_id", "departement_id"]].rename(columns={"departement_id": "home"})

            # Prepare work (look up the departement through the municipality register)
            df_work = df_spatial[0]
            df_work = pd.merge(df_work, df_codes, how="left", on="commune_id")
            df_work["departement_id"] = df_work["departement_id"].cat.remove_unused_categories()
            df_work = df_work[["person_id", "departement_id"]].rename(columns={"departement_id": "work"})

            # Calculate work flows
            df_work = pd.merge(df_home, df_work, on="person_id").groupby(
                ["home", "work"]).size().reset_index(name="weight")
            df_work["realization"] = realization
            work_flows.append(df_work)

            # Prepare education
            df_education = df_spatial[1]
            df_education = pd.merge(df_education, df_codes, how="left", on="commune_id")
            df_education["departement_id"] = df_education["departement_id"].cat.remove_unused_categories()
            df_education = df_education[["person_id", "departement_id"]].rename(columns={"departement_id": "education"})

            # Calculate education flows
            df_education = pd.merge(df_home, df_education, on="person_id").groupby(
                ["home", "education"]).size().reset_index(name="weight")
            df_education["realization"] = realization
            education_flows.append(df_education)

            progress.update()

    df_work = pd.concat(work_flows)
    df_education = pd.concat(education_flows)

    df_work = stats.analyze_sample_and_flatten(df_work)
    df_education = stats.analyze_sample_and_flatten(df_education)

    return dict(work=df_work, education=df_education)
def execute(context):
    # Obtain reference data
    reference = context.stage("analysis.reference.census.sociodemographics")
    reference = reference[MARGINAL_LEVEL][MARGINAL]
    reference = reference[np.logical_and.reduce([
        reference[name] == value for name, value in zip(MARGINAL, VALUES)
    ])]["weight"].values[0]

    # Gather marginal information
    df_data = []

    for sampling_rate in SAMPLING_RATES:
        df_marginals = []

        for df_stage in bt.get_stages(context, "sample_%f" % sampling_rate, sample_size=ACQUISITION_SAMPLE_SIZE):
            marginals.prepare_classes(df_stage)
            df_stage = stats.marginalize(df_stage, [MARGINAL], weight_column=None)[MARGINAL]
            df_stage["sampling_rate"] = sampling_rate
            df_marginals.append(df_stage)

        df_marginals = stats.collect_sample(df_marginals)
        df_marginals = df_marginals[np.logical_and.reduce([
            df_marginals[name] == value for name, value in zip(MARGINAL, VALUES)
        ])]

        df_data.append(df_marginals)

    df_data = pd.concat(df_data)

    # Bootstrap the estimator for increasing numbers of seeds
    sample_sizes = np.arange(1, MAXIMUM_SAMPLE_SIZE + 1)
    df_figure = []

    for sampling_rate in SAMPLING_RATES:
        for sample_size in context.progress(sample_sizes, label="Calculating sample sizes ..."):
            df_marginals = df_data[df_data["sampling_rate"] == sampling_rate]
            df_marginals = df_marginals.drop(columns=["sampling_rate"])

            df_bootstrap = stats.bootstrap(df_marginals, ESTIMATION_SAMPLES, sample_size, metrics={
                "mean": "mean",
                "q5": lambda x: x.quantile(0.05),
                "q95": lambda x: x.quantile(0.95),
                "precision": lambda x: np.mean(
                    np.abs(x / sampling_rate - reference) / reference <= ERROR_THRESHOLD)
            })

            df_bootstrap["sample_size"] = sample_size
            df_bootstrap["sampling_rate"] = sampling_rate
            df_figure.append(df_bootstrap)

    df_figure = pd.concat(df_figure)

    # Plotting
    plotting.setup()
    plt.figure(figsize=plotting.SHORT_FIGSIZE)

    for sampling_rate in SAMPLING_RATES:
        df_rate = df_figure[df_figure["sampling_rate"] == sampling_rate]
        plt.plot(df_rate["sample_size"], df_rate["precision"],
                 label=SAMPLING_RATE_LABELS[sampling_rate],
                 color=SAMPLING_RATE_COLORS[sampling_rate])

    plt.plot([0, MAXIMUM_SAMPLE_SIZE + 1], [0.9, 0.9], 'k:')

    plt.xlim([1, MAXIMUM_SAMPLE_SIZE])
    plt.ylim([0, 1.05])

    plt.xlabel("Number of seeds $K$")
    plt.ylabel(r"Error probability")

    plt.gca().xaxis.set_major_locator(tck.FixedLocator([1, 10, 20, 30, 40]))
    plt.gca().yaxis.set_major_formatter(tck.FuncFormatter(lambda x, p: "%d%%" % (x * 100,)))

    plt.grid()
    plt.gca().set_axisbelow(True)

    plt.legend(loc="best", title="Sampling rate $s$")

    plt.tight_layout()
    plt.savefig("%s/error_probability.pdf" % context.path())
    plt.close()
def execute(context):
    # Obtain reference data
    reference = context.stage("analysis.reference.census.sociodemographics")
    reference = reference[MARGINAL_LEVEL][MARGINAL]
    reference = reference[np.logical_and.reduce([
        reference[name] == value for name, value in zip(MARGINAL, VALUES)
    ])]["weight"].values[0]

    # Gather information
    df_marginals = []

    for df_stage in bt.get_stages(context, "sample", sample_size=ACQUISITION_SAMPLE_SIZE):
        marginals.prepare_classes(df_stage)
        df_marginals.append(stats.marginalize(df_stage, [MARGINAL], weight_column=None)[MARGINAL])

    df_marginals = stats.collect_sample(df_marginals)
    df_marginals = df_marginals[np.logical_and.reduce([
        df_marginals[name] == value for name, value in zip(MARGINAL, VALUES)
    ])]

    # Bootstrap the estimator for increasing numbers of seeds
    sample_sizes = np.arange(1, MAXIMUM_SAMPLE_SIZE + 1)
    df_figure = []

    for sample_size in context.progress(sample_sizes, label="Calculating sample sizes ..."):
        df_bootstrap = stats.bootstrap(df_marginals, ESTIMATION_SAMPLES, sample_size)
        df_bootstrap["sample_size"] = sample_size
        df_figure.append(df_bootstrap)

    df_figure = pd.concat(df_figure)

    # Scale counts up to the full population
    df_figure["mean"] /= SAMPLING_RATE
    df_figure["q5"] /= SAMPLING_RATE
    df_figure["q95"] /= SAMPLING_RATE

    # Prepare plot
    plotting.setup()
    plt.figure(figsize=plotting.SHORT_FIGSIZE)

    plt.fill_between(df_figure["sample_size"], df_figure["q5"], df_figure["q95"],
                     alpha=0.25, label="90% Conf.", color=plotting.COLORSET[0], linewidth=0.0)

    plt.plot([1, MAXIMUM_SAMPLE_SIZE], [reference] * 2, 'k--', label="Ref. $w$")
    plt.plot([1, MAXIMUM_SAMPLE_SIZE], [reference * 0.99] * 2, 'k:', label="1% Err.")
    plt.plot([1, MAXIMUM_SAMPLE_SIZE], [reference * 1.01] * 2, 'k:')

    plt.plot(df_figure["sample_size"], df_figure["mean"],
             label=r"$\mathrm{\mathbf{E}}[\tilde w_K]$", color=plotting.COLORSET[0])

    plt.xlim([1, MAXIMUM_SAMPLE_SIZE])

    plt.xlabel("Number of seeds $K$")
    plt.ylabel("Stratum weight")

    plt.gca().xaxis.set_major_locator(tck.FixedLocator([1, 5, 10, 15, 20, 25]))
    plt.gca().yaxis.set_major_formatter(tck.FuncFormatter(lambda x, p: "%.2fM" % (x * 1e-6,)))

    plt.grid()
    plt.gca().set_axisbelow(True)

    plt.legend(loc="best", ncol=2)

    plt.tight_layout()
    plt.savefig("%s/sample_count.pdf" % context.path())
    plt.close()