def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    feeder = zip(
        bs.get_stages(context, "synthesis.population.spatial.home.locations",
                      acquisition_sample_size),
        bs.get_stages(context,
                      "synthesis.population.spatial.primary.locations",
                      acquisition_sample_size),
        bs.get_stages(context, "synthesis.population.sampled",
                      acquisition_sample_size),
    )

    probabilities = np.linspace(0.0, 1.0, 20)
    quantiles = {"work": [], "education": []}

    with context.progress(label="Processing commute data ...",
                          total=acquisition_sample_size) as progress:
        for df_home, df_spatial, df_persons in feeder:
            # Prepare home
            df_home = pd.merge(df_home,
                               df_persons[["person_id", "household_id"]],
                               on="household_id")
            df_home = df_home[["person_id", "geometry"
                               ]].set_index("person_id").sort_index()
            assert len(df_home) == len(df_persons)

            for index, name in enumerate(("work", "education")):
                df_destination = df_spatial[index]
                df_destination = df_destination[["person_id", "geometry"]]
                df_destination = df_destination.set_index(
                    "person_id").sort_index()

                df_compare = df_home.loc[df_destination.index]
                assert len(df_destination) == len(df_compare)

                distances = df_destination["geometry"].distance(
                    df_compare["geometry"]) * 1e-3  # meters to kilometers

                quantiles[name].append(
                    [distances.quantile(p) for p in probabilities])

            progress.update()

    result = {}
    random = np.random.RandomState(0)

    for name in ("work", "education"):
        data = np.array(quantiles[name])
        indices = random.randint(acquisition_sample_size,
                                 size=ESTIMATION_SAMPLE_SIZE)

        mean = np.mean(data[indices, :], axis=0)
        q5 = np.percentile(data[indices, :], 5, axis=0)
        q95 = np.percentile(data[indices, :], 95, axis=0)

        df = pd.DataFrame(dict(mean=mean, q5=q5, q95=q95, cdf=probabilities))
        result[name] = df

    return result
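
A note on the resampling idiom that closes this stage: the per-seed quantile curves are bootstrapped by drawing seed indices with replacement and summarizing across the draws. A minimal, self-contained sketch of the same idea on synthetic data (the array shape and the constant's value are assumptions):

import numpy as np
import pandas as pd

ESTIMATION_SAMPLE_SIZE = 1000  # assumed value of the module-level constant

# Synthetic stand-in: 40 acquisition seeds x 20 quantile points each
acquisition_sample_size = 40
probabilities = np.linspace(0.0, 1.0, 20)
data = np.sort(np.random.RandomState(1).gamma(
    2.0, 5.0, size=(acquisition_sample_size, len(probabilities))), axis=1)

# Draw seed indices with replacement, then summarize across the draws
random = np.random.RandomState(0)
indices = random.randint(acquisition_sample_size, size=ESTIMATION_SAMPLE_SIZE)

df = pd.DataFrame(dict(
    mean=np.mean(data[indices, :], axis=0),
    q5=np.percentile(data[indices, :], 5, axis=0),
    q95=np.percentile(data[indices, :], 95, axis=0),
    cdf=probabilities))
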
Example #2
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    person_marginals = []
    household_marginals = []

    feeder = zip(
        bs.get_stages(context, "synthesis.population.enriched", acquisition_sample_size),
        bs.get_stages(context, "synthesis.population.spatial.home.zones", acquisition_sample_size)
    )

    for df, df_home in feeder:
        df = pd.merge(df, df_home[["household_id", "departement_id", "commune_id"]])
        marginals.prepare_classes(df)

        person_marginals.append(stats.marginalize(df, marginals.SPATIAL_PERSON_MARGINALS, weight_column = None))
        household_marginals.append(stats.marginalize(df.drop_duplicates("household_id"), marginals.SPATIAL_HOUSEHOLD_MARGINALS, weight_column = None))

    person_marginals = stats.combine_marginals(person_marginals)
    household_marginals = stats.combine_marginals(household_marginals)

    person_marginals = stats.apply_per_marginal(person_marginals, stats.analyze_sample_and_flatten)
    household_marginals = stats.apply_per_marginal(household_marginals, stats.analyze_sample_and_flatten)

    return dict(person = person_marginals, household = household_marginals)
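
stats.marginalize is not shown in this listing; conceptually it counts observations per combination of the attributes defining each marginal. A hedged stand-in (the name, signature, and tuple-keyed return format are assumptions):

import pandas as pd

def marginalize(df, marginal_list, weight_column=None):
    # Hypothetical stand-in: one count (or weight-sum) table per marginal,
    # keyed by the tuple of column names that defines it
    result = {}
    for marginal in marginal_list:
        grouped = df.groupby(list(marginal))
        if weight_column is None:
            result[marginal] = grouped.size().reset_index(name="weight")
        else:
            result[marginal] = grouped[weight_column].sum().reset_index(name="weight")
    return result

# Usage: persons by (age_class, sex)
df = pd.DataFrame(dict(age_class=["0-18", "18-65", "18-65"], sex=["m", "f", "f"]))
print(marginalize(df, [("age_class", "sex")])[("age_class", "sex")])
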
Example #3
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    probabilities = np.linspace(0.0, 1.0, 20)
    modes = ["car", "car_passenger", "pt", "bike", "walk"]

    quantiles = { mode : [] for mode in modes }

    generator = zip(
        bs.get_stages(context, "synthesis.population.spatial.locations", acquisition_sample_size),
        bs.get_stages(context, "synthesis.population.trips", acquisition_sample_size)
    )

    with context.progress(label = "Processing distance data ...", total = acquisition_sample_size) as progress:
        for df_locations, df_trips in generator:
            # Load locations and calculate euclidean distances
            df_locations = df_locations[["person_id", "activity_index", "geometry"]].rename(columns = { "activity_index": "trip_index" })
            df_locations["euclidean_distance"] = df_locations["geometry"].distance(df_locations["geometry"].shift(-1))

            # Merge mode into distances
            df_trips = pd.merge(
                df_trips[["person_id", "trip_index", "mode", "preceding_purpose", "following_purpose", "departure_time", "arrival_time"]],
                df_locations, on = ["person_id", "trip_index"], how = "inner"
            )
            df_trips["travel_time"] = df_trips["arrival_time"] - df_trips["departure_time"]

            # Filter trips
            primary_activities = ["home", "work", "education"]
            #primary_activities = []
            df_trips = df_trips[~(
                df_trips["preceding_purpose"].isin(primary_activities) &
                df_trips["following_purpose"].isin(primary_activities)
            )]

            # Calculate quantiles
            for mode in modes:
                df_mode = df_trips[df_trips["mode"] == mode]
                quantiles[mode].append([df_mode["euclidean_distance"].quantile(p) for p in probabilities])

            progress.update()

    for mode in modes:
        quantiles[mode] = np.array(quantiles[mode])

    random = np.random.RandomState(0)
    df_data = []

    for mode in modes:
        indices = random.randint(acquisition_sample_size, size = ESTIMATION_SAMPLE_SIZE)

        mean = np.mean(quantiles[mode][indices,:], axis = 0)
        q5 = np.percentile(quantiles[mode][indices,:], 5, axis = 0)
        q95 = np.percentile(quantiles[mode][indices,:], 95, axis = 0)

        df_data.append(pd.DataFrame(dict(mean = mean, q5 = q5, q95 = q95, cdf = probabilities)))
        df_data[-1]["mode"] = mode

    return pd.concat(df_data)
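
The distance step above pairs each location with the following row via shift(-1). A standalone sketch of the same pattern on plain shapely points; shifting within each person, which the pipeline presumably guarantees via the inner merge on trip_index, is made explicit here:

import geopandas as gpd
from shapely.geometry import Point

df_locations = gpd.GeoDataFrame(
    dict(person_id=[1, 1, 1, 2, 2], trip_index=[0, 1, 2, 0, 1]),
    geometry=[Point(0, 0), Point(3, 4), Point(3, 0), Point(0, 0), Point(1, 1)])

# Distance from each activity location to the person's next one; the last
# location of each person has no successor and yields NaN
shifted = gpd.GeoSeries(df_locations.groupby("person_id")["geometry"].shift(-1))
df_locations["euclidean_distance"] = df_locations["geometry"].distance(shifted)
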
Example #4
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    feeder = zip(
        bs.get_stages(context, "synthesis.population.spatial.home.zones", acquisition_sample_size),
        bs.get_stages(context, "synthesis.population.spatial.primary.locations", acquisition_sample_size),
        bs.get_stages(context, "synthesis.population.sampled", acquisition_sample_size),
    )

    work_flows = []
    education_flows = []

    with context.progress(label = "Processing commute data ...", total = acquisition_sample_size) as progress:
        for sample, (df_home, df_spatial, df_persons) in enumerate(feeder):
            # Prepare home
            df_home = pd.merge(df_persons[["person_id", "household_id"]], df_home, on = "household_id")
            df_home = df_home[["person_id", "departement_id"]].rename(columns = { "departement_id": "home" })

            # Prepare work
            df_work = df_spatial[0]
            df_work["departement_id"] = df_work["commune_id"] // 1000
            df_work = df_work[["person_id", "departement_id"]].rename(columns = { "departement_id": "work" })

            # Calculate work
            df_work = pd.merge(df_home, df_work, on = "person_id").groupby(["home", "work"]).size().reset_index(name = "weight")
            df_work["sample"] = sample
            work_flows.append(df_work)

            # Prepare education
            df_education = df_spatial[1]
            df_education["departement_id"] = df_education["commune_id"] // 1000
            df_education = df_education[["person_id", "departement_id"]].rename(columns = { "departement_id": "education" })

            # Calculate education
            df_education = pd.merge(df_home, df_education, on = "person_id").groupby(["home", "education"]).size().reset_index(name = "weight")
            df_education["sample"] = sample
            education_flows.append(df_education)

            progress.update()

    df_work = pd.concat(work_flows)
    df_education = pd.concat(education_flows)

    df_work = stats.bootstrap(df_work, ESTIMATION_SAMPLE_SIZE)
    df_education = stats.bootstrap(df_education, ESTIMATION_SAMPLE_SIZE)

    return dict(work = df_work, education = df_education)
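
stats.bootstrap receives tidy flow tables carrying a per-seed "sample" column. A hypothetical sketch of what such a helper might do: resample whole seeds with replacement and summarize every origin-destination cell (the name, signature, and metrics are assumptions):

import numpy as np
import pandas as pd

def bootstrap(df, estimation_sample_size, random_seed=0):
    # Hypothetical stand-in for stats.bootstrap on a tidy table with
    # "sample" (seed) and "weight" columns
    random = np.random.RandomState(random_seed)
    seeds = df["sample"].unique()
    group_columns = [c for c in df.columns if c not in ("sample", "weight")]

    draws = []
    for _ in range(estimation_sample_size):
        chosen = random.choice(seeds, size=len(seeds), replace=True)
        # A seed drawn twice counts twice; average over the drawn seeds
        df_draw = pd.concat([df[df["sample"] == s] for s in chosen])
        draws.append(df_draw.groupby(group_columns)["weight"].sum() / len(seeds))

    df_draws = pd.concat(draws, axis=1)
    return pd.DataFrame(dict(
        mean=df_draws.mean(axis=1),
        q5=df_draws.quantile(0.05, axis=1),
        q95=df_draws.quantile(0.95, axis=1))).reset_index()
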
Example #5
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    person_marginals = []
    household_marginals = []

    for df in bs.get_stages(context, "synthesis.population.enriched",
                            acquisition_sample_size):
        marginals.prepare_classes(df)

        person_marginals.append(
            stats.marginalize(df,
                              marginals.ANALYSIS_PERSON_MARGINALS,
                              weight_column=None))
        household_marginals.append(
            stats.marginalize(df.drop_duplicates("household_id"),
                              marginals.ANALYSIS_HOUSEHOLD_MARGINALS,
                              weight_column=None))

    person_marginals = stats.combine_marginals(person_marginals)
    household_marginals = stats.combine_marginals(household_marginals)

    person_marginals = stats.apply_per_marginal(
        person_marginals, stats.analyze_sample_and_flatten)
    household_marginals = stats.apply_per_marginal(
        household_marginals, stats.analyze_sample_and_flatten)

    return dict(person=person_marginals, household=household_marginals)
Example #6
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    person_marginals = []
    household_marginals = []

    for df in bs.get_stages(context, "synthesis.population.enriched",
                            acquisition_sample_size):
        marginals.prepare_classes(df)

        person_marginals.append(
            stats.marginalize(df,
                              marginals.ANALYSIS_PERSON_MARGINALS,
                              weight_column=None))
        household_marginals.append(
            stats.marginalize(df.drop_duplicates("household_id"),
                              marginals.ANALYSIS_HOUSEHOLD_MARGINALS,
                              weight_column=None))

    person_marginals = stats.collect_marginalized_sample(person_marginals)
    household_marginals = stats.collect_marginalized_sample(
        household_marginals)

    person_marginals = stats.bootstrap_sampled_marginals(
        person_marginals, ESTIMATION_SAMPLE_SIZE)
    household_marginals = stats.bootstrap_sampled_marginals(
        household_marginals, ESTIMATION_SAMPLE_SIZE)

    return dict(person=person_marginals, household=household_marginals)
Example #7
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    data = []

    feeder = zip(
        bs.get_stages(context, "synthesis.population.sampled",
                      acquisition_sample_size),
        bs.get_stages(context, "synthesis.population.activities",
                      acquisition_sample_size))

    with context.progress(label="Marginalizing chain data ...",
                          total=acquisition_sample_size):
        with context.parallel() as parallel:
            data = list(parallel.imap_unordered(execute_parallel, feeder))

    data = stats.combine_marginals(data)
    data = stats.apply_per_marginal(data, stats.analyze_sample_and_flatten)

    return data
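
execute_parallel is referenced here but not defined in this listing. A hypothetical sketch of such a worker, consuming one (persons, activities) tuple from the feeder and returning per-chain counts; the chain construction and column names are assumptions, and the framework may additionally pass a context argument to parallel workers:

import pandas as pd

def execute_parallel(args):
    df_persons, df_activities = args

    # One activity-chain string per person, e.g. "home-work-home"
    df_chains = (df_activities
                 .sort_values(["person_id", "activity_index"])
                 .groupby("person_id")["purpose"]
                 .apply("-".join)
                 .reset_index(name="chain"))

    df = pd.merge(df_persons[["person_id"]], df_chains, on="person_id")
    return df.groupby("chain").size().reset_index(name="weight")
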
Example #8
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    data = []

    feeder = zip(
        bs.get_stages(context, "synthesis.population.sampled",
                      acquisition_sample_size),
        bs.get_stages(context, "synthesis.population.activities",
                      acquisition_sample_size))

    with context.progress(label="Marginalizing chain data ...",
                          total=acquisition_sample_size):
        with context.parallel() as parallel:
            data = list(parallel.imap_unordered(execute_parallel, feeder))

    data = stats.collect_marginalized_sample(data)
    data = stats.bootstrap_sampled_marginals(data, ESTIMATION_SAMPLE_SIZE)

    return data
Example #9
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    probabilities = np.linspace(0.0, 1.0, 20)
    quantiles = []

    with context.progress(label = "Processing commute data ...", total = acquisition_sample_size) as progress:
        for df_income in bs.get_stages(context, "synthesis.population.income", acquisition_sample_size):
            income = 12 * df_income["household_income"] / df_income["consumption_units"]
            quantiles.append([income.quantile(p) for p in probabilities])
            progress.update()

    quantiles = np.array(quantiles)

    mean = np.mean(quantiles, axis = 0)
    minimum = np.min(quantiles, axis = 0)
    maximum = np.max(quantiles, axis = 0)

    return pd.DataFrame(dict(mean = mean, min = minimum, max = maximum, cdf = probabilities))
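
The income variable is equivalized: twelve monthly household incomes divided by the household's consumption units, which presumably follow the OECD-modified scale (1.0 for the first adult, 0.5 per further member aged 14 or older, 0.3 per younger child). A sketch of that upstream definition, stated as an assumption:

import pandas as pd

def consumption_units(adults, children):
    # OECD-modified equivalence scale; an assumption about how the
    # consumption_units column is derived upstream
    return 1.0 + 0.5 * (adults - 1) + 0.3 * children

df = pd.DataFrame(dict(household_income=[1500.0, 3200.0],  # assumed monthly
                       adults=[1, 2], children=[0, 2]))
df["consumption_units"] = consumption_units(df["adults"], df["children"])
df["equivalized_income"] = 12 * df["household_income"] / df["consumption_units"]
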
Example #10
def execute(context):
    reference = context.stage("analysis.reference.census.sociodemographics")["person"]

    output = { marginal: [] for marginal in MARGINALS }
    total = len(SAMPLING_RATES) * len(MARGINALS) * ACQUISITION_SAMPLE_SIZE

    with context.progress(label = "Running Monte Carlo analysis ...", total = total) as progress:
        for sampling_rate in SAMPLING_RATES:
            partial_marginals = list(bt.get_stages(context, "sample_%f" % sampling_rate, sample_size = ACQUISITION_SAMPLE_SIZE))

            with context.parallel(data = dict(partial_marginals = partial_marginals, reference = reference, sampling_rate = sampling_rate)) as parallel:

                for partial_output in parallel.imap_unordered(process, np.arange(1, ACQUISITION_SAMPLE_SIZE + 1)):
                    for marginal in MARGINALS:
                        output[marginal].append(partial_output[marginal])

    for marginal in MARGINALS:
        output[marginal] = pd.concat(output[marginal])

    return output
Example #11
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    probabilities = np.linspace(0.0, 1.0, 20)
    quantiles = []

    with context.progress(label = "Processing commute data ...", total = acquisition_sample_size) as progress:
        for df_income in bs.get_stages(context, "synthesis.population.income", acquisition_sample_size):
            income = 12 * df_income["household_income"] / df_income["consumption_units"]
            quantiles.append([income.quantile(p) for p in probabilities])
            progress.update()

    random = np.random.RandomState(0)

    quantiles = np.array(quantiles)
    indices = random.randint(acquisition_sample_size, size = ESTIMATION_SAMPLE_SIZE)

    mean = np.mean(quantiles[indices,:], axis = 0)
    q5 = np.percentile(quantiles[indices,:], 5, axis = 0)
    q95 = np.percentile(quantiles[indices,:], 95, axis = 0)

    return pd.DataFrame(dict(mean = mean, q5 = q5, q95 = q95, cdf = probabilities))
Example #12
def execute(context):
    df_codes = context.stage("data.spatial.municipalities")[[
        "commune_id", "departement_id"
    ]]

    acquisition_sample_size = context.config("acquisition_sample_size")

    feeder = zip(
        bs.get_stages(context, "synthesis.population.spatial.home.zones",
                      acquisition_sample_size),
        bs.get_stages(context,
                      "synthesis.population.spatial.primary.locations",
                      acquisition_sample_size),
        bs.get_stages(context, "synthesis.population.sampled",
                      acquisition_sample_size),
    )

    work_flows = []
    education_flows = []

    with context.progress(label="Processing commute data ...",
                          total=acquisition_sample_size) as progress:
        for realization, (df_home, df_spatial,
                          df_persons) in enumerate(feeder):
            # Prepare home
            df_home = pd.merge(df_persons[["person_id", "household_id"]],
                               df_home,
                               on="household_id")
            df_home = df_home[["person_id", "departement_id"
                               ]].rename(columns={"departement_id": "home"})

            # Prepare work
            df_work = df_spatial[0]
            df_work = pd.merge(df_work, df_codes, how="left", on="commune_id")
            df_work["departement_id"] = df_work[
                "departement_id"].cat.remove_unused_categories()
            df_work = df_work[["person_id", "departement_id"
                               ]].rename(columns={"departement_id": "work"})

            # Calculate work
            df_work = pd.merge(df_home, df_work, on="person_id").groupby(
                ["home", "work"]).size().reset_index(name="weight")
            df_work["realization"] = realization
            work_flows.append(df_work)

            # Prepare education
            df_education = df_spatial[1]
            df_education = pd.merge(df_education,
                                    df_codes,
                                    how="left",
                                    on="commune_id")
            df_education["departement_id"] = df_education[
                "departement_id"].cat.remove_unused_categories()
            df_education = df_education[[
                "person_id", "departement_id"
            ]].rename(columns={"departement_id": "education"})

            # Calculate education
            df_education = pd.merge(df_home, df_education,
                                    on="person_id").groupby([
                                        "home", "education"
                                    ]).size().reset_index(name="weight")
            df_education["realization"] = realization
            education_flows.append(df_education)

            progress.update()

    df_work = pd.concat(work_flows)
    df_education = pd.concat(education_flows)

    df_work = stats.analyze_sample_and_flatten(df_work)
    df_education = stats.analyze_sample_and_flatten(df_education)

    return dict(work=df_work, education=df_education)
Example #13
def execute(context):
    # Obtain reference data
    reference = context.stage("analysis.reference.census.sociodemographics")
    reference = reference[MARGINAL_LEVEL][MARGINAL]

    reference = reference[np.logical_and.reduce([
        reference[name] == value for name, value in zip(MARGINAL, VALUES)
    ])]["weight"].values[0]

    # Gather marginal information
    df_data = []

    for sampling_rate in SAMPLING_RATES:
        df_marginals = []

        for df_stage in bt.get_stages(context,
                                      "sample_%f" % sampling_rate,
                                      sample_size=ACQUISITION_SAMPLE_SIZE):
            marginals.prepare_classes(df_stage)
            df_stage = stats.marginalize(df_stage, [MARGINAL],
                                         weight_column=None)[MARGINAL]
            df_stage["sampling_rate"] = sampling_rate
            df_marginals.append(df_stage)

        df_marginals = stats.collect_sample(df_marginals)
        df_marginals = df_marginals[np.logical_and.reduce([
            df_marginals[name] == value
            for name, value in zip(MARGINAL, VALUES)
        ])]

        df_data.append(df_marginals)

    df_data = pd.concat(df_data)

    sample_sizes = np.arange(1, MAXIMUM_SAMPLE_SIZE + 1)
    df_figure = []

    for sampling_rate in SAMPLING_RATES:
        for sample_size in context.progress(
                sample_sizes, label="Calculating sample sizes ..."):
            df_marginals = df_data[df_data["sampling_rate"] == sampling_rate]
            df_marginals = df_marginals.drop(columns=["sampling_rate"])

            df_bootstrap = stats.bootstrap(
                df_marginals,
                ESTIMATION_SAMPLES,
                sample_size,
                metrics={
                    "mean": "mean",
                    "q5": lambda x: x.quantile(0.05),
                    "q95": lambda x: x.quantile(0.95),
                    "precision": lambda x: np.mean(
                        np.abs(x / sampling_rate - reference) / reference
                        <= ERROR_THRESHOLD)
                })

            df_bootstrap["sample_size"] = sample_size
            df_bootstrap["sampling_rate"] = sampling_rate

            df_figure.append(df_bootstrap)

    df_figure = pd.concat(df_figure)

    # Plotting
    plotting.setup()
    plt.figure(figsize=plotting.SHORT_FIGSIZE)

    for index, sampling_rate in enumerate(SAMPLING_RATES):
        df_rate = df_figure[df_figure["sampling_rate"] == sampling_rate]
        plt.plot(df_rate["sample_size"],
                 df_rate["precision"],
                 label=SAMPLING_RATE_LABELS[sampling_rate],
                 color=SAMPLING_RATE_COLORS[sampling_rate])

    plt.plot([0, MAXIMUM_SAMPLE_SIZE + 1], [0.9, 0.9], 'k:')

    plt.xlim([1, MAXIMUM_SAMPLE_SIZE])
    plt.ylim([0, 1.05])
    plt.xlabel("Number of seeds $K$")
    plt.ylabel(r"Error probability")

    plt.gca().xaxis.set_major_locator(tck.FixedLocator([1, 10, 20, 30, 40]))
    plt.gca().yaxis.set_major_formatter(
        tck.FuncFormatter(lambda x, p: "%d%%" % (x * 100, )))

    plt.grid()
    plt.gca().set_axisbelow(True)

    plt.legend(loc="best", title="Sampling rate $s$")

    plt.tight_layout()
    plt.savefig("%s/error_probability.pdf" % context.path())
    plt.close()
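
The "precision" metric above is the share of bootstrap draws whose upscaled estimate lies within ERROR_THRESHOLD relative error of the reference. A standalone numeric sketch (all constants are assumptions):

import numpy as np

ERROR_THRESHOLD = 0.01   # assumed 1% relative error budget
reference = 2.0e6        # assumed reference stratum weight
sampling_rate = 0.05     # assumed census sampling rate

# Synthetic bootstrap estimates of the downscaled stratum weight
random = np.random.RandomState(0)
estimates = random.normal(reference * sampling_rate, 500.0, size=1000)

precision = np.mean(
    np.abs(estimates / sampling_rate - reference) / reference <= ERROR_THRESHOLD)
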
Example #14
def execute(context):
    # Obtain reference data
    reference = context.stage("analysis.reference.census.sociodemographics")
    reference = reference[MARGINAL_LEVEL][MARGINAL]

    reference = reference[np.logical_and.reduce([
        reference[name] == value for name, value in zip(MARGINAL, VALUES)
    ])]["weight"].values[0]

    # Gather information
    df_marginals = []

    for df_stage in bt.get_stages(context,
                                  "sample",
                                  sample_size=ACQUISITION_SAMPLE_SIZE):
        marginals.prepare_classes(df_stage)
        df_marginals.append(
            stats.marginalize(df_stage, [MARGINAL],
                              weight_column=None)[MARGINAL])

    df_marginals = stats.collect_sample(df_marginals)
    df_marginals = df_marginals[np.logical_and.reduce([
        df_marginals[name] == value for name, value in zip(MARGINAL, VALUES)
    ])]

    sample_sizes = np.arange(1, MAXIMUM_SAMPLE_SIZE + 1)
    df_figure = []

    for sample_size in context.progress(sample_sizes,
                                        label="Calculating sample sizes ..."):
        df_bootstrap = stats.bootstrap(df_marginals, ESTIMATION_SAMPLES,
                                       sample_size)
        df_bootstrap["sample_size"] = sample_size
        df_figure.append(df_bootstrap)

    df_figure = pd.concat(df_figure)

    df_figure["mean"] /= SAMPLING_RATE
    df_figure["q5"] /= SAMPLING_RATE
    df_figure["q95"] /= SAMPLING_RATE

    # Prepare plot
    plotting.setup()
    plt.figure(figsize=plotting.SHORT_FIGSIZE)

    plt.fill_between(df_figure["sample_size"],
                     df_figure["q5"],
                     df_figure["q95"],
                     alpha=0.25,
                     label="90% Conf.",
                     color=plotting.COLORSET[0],
                     linewidth=0.0)
    plt.plot([1, MAXIMUM_SAMPLE_SIZE], [reference] * 2,
             'k--',
             label="Ref. $w$")
    plt.plot([1, MAXIMUM_SAMPLE_SIZE], [reference * 0.99] * 2,
             'k:',
             label="1% Err.")
    plt.plot([1, MAXIMUM_SAMPLE_SIZE], [reference * 1.01] * 2, 'k:')
    plt.plot(df_figure["sample_size"],
             df_figure["mean"],
             label=r"$\mathrm{\mathbf{E}}[\tilde w_K]$",
             color=plotting.COLORSET[0])

    plt.xlim([1, MAXIMUM_SAMPLE_SIZE])
    plt.xlabel("Number of seeds $K$")
    plt.ylabel("Stratum weight")

    plt.gca().xaxis.set_major_locator(tck.FixedLocator([1, 5, 10, 15, 20, 25]))
    plt.gca().yaxis.set_major_formatter(
        tck.FuncFormatter(lambda x, p: "%.2fM" % (x * 1e-6, )))

    plt.grid()
    plt.gca().set_axisbelow(True)

    plt.legend(loc="best", ncol=2)

    plt.tight_layout()
    plt.savefig("%s/sample_count.pdf" % context.path())
    plt.close()