示例#1
0
def run_full_models_for_mcmc(burn_in: int, src_db_path: str, dest_db_path: str,
                             build_model, params: dict):
    """
    Re-run the baseline model and every scenario for each accepted MCMC run.

    The ``mcmc_run`` table is read from the source database, its first
    ``burn_in`` rows are discarded, and the remaining rows are written to the
    destination database. For each remaining row whose ``accept`` value is
    truthy, the row's non-metadata columns are applied as parameter updates,
    the baseline and all scenarios are run, and the resulting models are
    stored in the destination database.

    Args:
        burn_in: Number of leading ``mcmc_run`` rows to discard.
        src_db_path: Path of the database that holds the ``mcmc_run`` table.
        dest_db_path: Path of the database that receives the trimmed table
            and the model outputs.
        build_model: Passed through unchanged to every ``Scenario``.
        params: Parameter dict; ``params["scenarios"]`` determines how many
            scenarios are run on top of the baseline.
    """
    source_db = Database(src_db_path)
    target_db = Database(dest_db_path)

    logger.info("Copying mcmc_run table to %s", dest_db_path)
    all_runs_df = source_db.query("mcmc_run")

    # Report the burned-in rows, then persist only what comes after them.
    burned_ids = all_runs_df[:burn_in].idx
    logger.info("Burned MCMC runs %s", ", ".join(burned_ids))
    kept_runs_df = all_runs_df[burn_in:]
    target_db.dump_df("mcmc_run", kept_runs_df)

    for run_row in list(kept_runs_df.T.to_dict().values()):
        # Separate bookkeeping columns from the calibrated parameter values.
        meta = {}
        param_updates = {}
        for column, value in run_row.items():
            if column in META_COLS:
                meta[column] = value
            else:
                param_updates[column] = value

        if not meta["accept"]:
            logger.info("Ignoring non-accepted MCMC run %s", meta["idx"])
            continue

        logger.info("Running full model for MCMC run %s", meta["idx"])

        # The run index is whatever follows the last underscore of the id.
        run_idx = meta["idx"].rsplit("_", 1)[-1]

        def update_func(ps: dict):
            return update_params(ps, param_updates)

        with Timer("Running model scenarios"):
            # Scenario 0 is the baseline, hence one more than is configured.
            scenario_count = len(params["scenarios"]) + 1
            scenarios = [
                Scenario(build_model, scenario_idx, params)
                for scenario_idx in range(scenario_count)
            ]

            # The baseline runs first; its model is handed to the others.
            baseline_scenario, *other_scenarios = scenarios
            baseline_scenario.run(update_func=update_func)
            baseline_model = baseline_scenario.model

            for scenario in other_scenarios:
                scenario.run(base_model=baseline_model,
                             update_func=update_func)

        with Timer("Saving model outputs to the database"):
            store_run_models([s.model for s in scenarios],
                             dest_db_path,
                             run_idx=run_idx)

    logger.info("Finished running full models for all accepted MCMC runs.")
示例#2
0
def preprocess_demography(input_db: Database):
    """
    Read the demographic dataframes and write each one to the input database.

    The location dataframe is read first and passed to each of the other
    readers. Returns the location dataframe.
    """
    loc_df = read_location_df()

    # Table name -> dataframe, in the order the tables are written.
    tables = {
        "countries": loc_df,
        "population": read_population_df(loc_df),
        "birth_rates": read_crude_birth_df(loc_df),
        "deaths": read_death_df(loc_df),
        "life_expectancy": read_life_expectancy_df(loc_df),
    }
    for table_name, table_df in tables.items():
        input_db.dump_df(table_name, table_df)

    return loc_df
示例#3
0
def preprocess_mobility(input_db: Database, country_df):
    """
    Read Google Mobility data from CSV into input database

    The raw CSV is filtered to major regions (plus Davao City), a few region
    names are normalised, the cluster-level rows produced by
    ``reshape_to_clusters`` are added, rows with any missing mobility value
    are dropped, percentages are converted to multipliers, and the result is
    written to the ``mobility`` table keyed by ISO3 code, region and date.

    Args:
        input_db: Database that receives the ``mobility`` table.
        country_df: Country dataframe passed to ``get_iso3`` for name lookup.
    """
    mob_df = pd.read_csv(MOBILITY_CSV_PATH)

    # Built from the unfiltered data, before the sub-region rows are dropped.
    dhhs_cluster_mobility = reshape_to_clusters(mob_df)

    # Drop all sub-region 2 data, too detailed.
    # Davao City metro-area rows are kept explicitly.
    major_region_mask = mob_df["sub_region_2"].isnull() & mob_df["metro_area"].isnull()
    davao_mask = mob_df.metro_area == "Davao City Metropolitan Area"
    mob_df = mob_df[major_region_mask | davao_mask].copy()

    # Normalise region names.
    mob_df.loc[(mob_df.sub_region_1 == "National Capital Region"), "sub_region_1"] = "Metro Manila"
    mob_df.loc[(mob_df.metro_area == "Davao City Metropolitan Area"), "sub_region_1"] = "Davao City"
    mob_df.loc[
        (mob_df.sub_region_1 == "Federal Territory of Kuala Lumpur"), "sub_region_1"
    ] = "Kuala Lumpur"

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    mob_df = pd.concat([mob_df, dhhs_cluster_mobility])

    # Drop all rows that have NA values in 1 or more mobility columns.
    # any(axis=1) also copes with there being no mobility columns at all.
    mob_cols = [c for c in mob_df.columns if c.endswith(MOBILITY_SUFFIX)]
    has_missing = mob_df[mob_cols].isnull().any(axis=1)
    mob_df = mob_df[~has_missing].copy()

    for c in mob_cols:
        # Convert percent values to decimal: 1.0 being no change.
        mob_df[c] = 1 + mob_df[c] / 100

    # Drop unused columns, rename kept columns
    cols_to_keep = {*mob_cols, "country_region", "sub_region_1", "date"}
    cols_to_drop = [c for c in mob_df.columns if c not in cols_to_keep]
    mob_df = mob_df.drop(columns=cols_to_drop)
    mob_col_rename = {c: c.replace(MOBILITY_SUFFIX, "") for c in mob_cols}
    mob_df = mob_df.rename(columns={**mob_col_rename, "sub_region_1": "region"})

    # Convert countries to ISO3; each distinct country name is resolved once.
    countries = mob_df["country_region"].unique().tolist()
    iso3s = {c: get_iso3(c, country_df) for c in countries}
    iso3_series = mob_df["country_region"].apply(lambda c: iso3s[c])
    mob_df.insert(0, "iso3", iso3_series)
    mob_df = mob_df.drop(columns=["country_region"])

    mob_df = mob_df.sort_values(["iso3", "region", "date"])
    input_db.dump_df("mobility", mob_df)
示例#4
0
def preprocess_social_mixing(input_db: Database, country_df):
    """
    Load the social mixing Excel workbooks into the ``social_mixing`` table.

    One workbook is read per (location, sheet number) pair. Every sheet in a
    workbook is named after a country; the sheet name is resolved to an ISO3
    code with ``get_iso3`` and the sheet is dumped with ``iso3`` and
    ``location`` columns prepended.

    Args:
        input_db: Database that receives the ``social_mixing`` rows.
        country_df: Country dataframe passed to ``get_iso3`` for name lookup.
    """
    for location in LOCATIONS:
        for sheet_number, header_arg in SHEET_NUMBERS:
            workbook_name = f"MUestimates_{location}_{sheet_number}.xlsx"
            workbook_path = os.path.join(MIXING_DIRPATH, workbook_name)

            # The context manager closes the workbook once its sheets are read.
            with pd.ExcelFile(workbook_path) as xl:
                sheet_names = xl.sheet_names
                iso3s = [get_iso3(n, country_df) for n in sheet_names]
                for iso3, sheet_name in zip(iso3s, sheet_names):
                    mix_df = pd.read_excel(xl,
                                           header=header_arg,
                                           sheet_name=sheet_name)
                    if sheet_number == "2":
                        # Relabel integer columns 0..15 as X1..X16.
                        # NOTE(review): presumably matches the headers of the
                        # "1" workbooks - confirm against the data files.
                        renames = {n - 1: f"X{n}" for n in range(1, 17)}
                        mix_df.rename(columns=renames, inplace=True)

                    mix_df.insert(0, "location", [location] * len(mix_df))
                    mix_df.insert(0, "iso3", [iso3] * len(mix_df))
                    input_db.dump_df("social_mixing", mix_df)
示例#5
0
def test_plot_uncertainty(tmp_path):
    """
    Smoke test: uncertainty plots are written for synthetic calibration data.
    """
    output_dir = tmp_path
    powerbi_db_path = os.path.join(tmp_path, "powerbi.db")

    # Two identical target specs that differ only in their name.
    targets = {
        name: {
            "output_key": name,
            "title": name,
            "times": [],
            "values": [],
            "quantiles": [0.25, 0.5, 0.75],
        }
        for name in ("incidence", "foo")
    }

    def noisy_linear(t):
        return 2 * t + random.random()

    def noisy_cubic(t):
        return t**3 + random.random()

    # Build data for plotting
    do_df, mcmc_df, _ = build_synthetic_calibration(
        targets,
        [noisy_linear, noisy_cubic],
        chains=2,
        runs=20,
        times=20,
    )
    unc_df = calculate_mcmc_uncertainty(mcmc_df, do_df, targets)

    # Create database for plotting
    db = Database(powerbi_db_path)
    tables = {
        "mcmc_run": mcmc_df,
        "derived_outputs": do_df,
        "uncertainty": unc_df,
    }
    for table_name, table_df in tables.items():
        db.dump_df(table_name, table_df)

    # Create plots
    plot_uncertainty(targets, powerbi_db_path, output_dir)

    # Each target should have produced a plot in its own folder.
    expected_paths = (
        os.path.join(tmp_path, "foo", "uncertainty-foo-0.png"),
        os.path.join(tmp_path, "incidence", "uncertainty-incidence-0.png"),
    )
    for expected_path in expected_paths:
        assert os.path.exists(expected_path)
示例#6
0
def preprocess_our_world_in_data(input_db: Database):
    """
    Load the Our World in Data CSV into the ``owid`` table.
    """
    owid_df = pd.read_csv(OUR_WORLD_IN_DATA_CSV_PATH)

    # Blank out the implausibly large test count reported for Malaysia.
    is_malaysia = owid_df.iso_code == "MYS"
    is_outlier = owid_df.new_tests > 1e5
    owid_df.loc[is_malaysia & is_outlier, "new_tests"] = np.nan

    input_db.dump_df("owid", owid_df)
示例#7
0
def preprocess_covid_phl(input_db: Database):
    """
    Load the COVID PHL CSV, add region aggregates, and store it as ``covid_phl``.
    """
    raw_df = pd.read_csv(COVID_PHL_CSV_PATH)
    aggregated_df = create_region_aggregates(raw_df)
    input_db.dump_df("covid_phl", aggregated_df)
示例#8
0
def preprocess_covid_au(input_db: Database):
    """
    Load the COVID AU CSV into ``covid_au`` and the LGA CSV, reshaped to
    clusters, into ``covid_dhhs_test``.
    """
    # The AU data is stored as read.
    national_df = pd.read_csv(COVID_AU_CSV_PATH)
    input_db.dump_df("covid_au", national_df)

    # The LGA data is reshaped to clusters before being stored.
    lga_df = pd.read_csv(COVID_LGA_CSV_PATH)
    cluster_df = reshape_to_clusters(lga_df)
    input_db.dump_df("covid_dhhs_test", cluster_df)
示例#9
0
def preprocess_social_mixing(input_db: Database, country_df):
    """
    Load social mixing matrices into the input database.

    Two tables are written:

    * ``social_mixing``: one dump per sheet of every
      ``MUestimates_{location}_{sheet_number}.xlsx`` workbook, with ``iso3``
      and ``location`` columns prepended.
    * ``social_mixing_2020``: the "overall" setting of
      ``synthetic_contacts_2020.csv`` pivoted into the same X1..Xn layout,
      topped up with the ``social_mixing`` rows of any ISO3 code that the
      2020 data does not cover.

    Args:
        input_db: Database that receives both tables.
        country_df: Country dataframe passed to ``get_iso3`` for name lookup.
    """
    for location in LOCATIONS:
        for sheet_number, header_arg in SHEET_NUMBERS:
            workbook_name = f"MUestimates_{location}_{sheet_number}.xlsx"
            workbook_path = os.path.join(MIXING_DIRPATH, workbook_name)

            # The context manager closes the workbook once its sheets are read.
            with pd.ExcelFile(workbook_path) as xl:
                sheet_names = xl.sheet_names
                iso3s = [get_iso3(n, country_df) for n in sheet_names]
                for iso3, sheet_name in zip(iso3s, sheet_names):
                    mix_df = pd.read_excel(xl,
                                           header=header_arg,
                                           sheet_name=sheet_name)
                    if sheet_number == "2":
                        # Relabel integer columns 0..15 as X1..X16.
                        renames = {n - 1: f"X{n}" for n in range(1, 17)}
                        mix_df.rename(columns=renames, inplace=True)

                    mix_df.insert(0, "location", [location] * len(mix_df))
                    mix_df.insert(0, "iso3", [iso3] * len(mix_df))
                    input_db.dump_df("social_mixing", mix_df)

    # Next gen social mixing
    original_mm = input_db.query("social_mixing")

    df = pd.read_csv(
        os.path.join(MIXING_DIRPATH, "synthetic_contacts_2020.csv"))

    # Non-in-place calls avoid mutating a filtered slice of the raw frame.
    df = df[df.setting == "overall"].drop(columns="setting")
    df = df.replace({
        "0 to 4": "00 to 04",
        "5 to 9": "05 to 09",
        "all": "all_locations",
        "others": "other_locations",
    })

    # The contactor is in j (columns) and the contactee is in i (rows)
    # NOTE(review): "age_cotactee" is presumably the spelling used in the
    # CSV header - confirm against the data file before "fixing" it.
    df = df.pivot_table(
        index=["iso3c", "location_contact", "age_cotactee"],
        columns="age_contactor",
        values="mean_number_of_contacts",
    )
    df = df.reset_index().drop(columns="age_cotactee")

    # Everything after the two key columns is an age-group column: X1..Xn.
    age_cols = list(df.columns[2:])
    age_renames = {col: f"X{i}" for i, col in enumerate(age_cols, start=1)}
    df = df.rename(columns=age_renames)
    df = df.rename(columns={"iso3c": "iso3", "location_contact": "location"})

    # Fall back to the original matrices for countries the 2020 data lacks.
    iso3_diff = set(original_mm.iso3).difference(df.iso3)
    iso3_mask = original_mm.iso3.isin(iso3_diff)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    df = pd.concat([df, original_mm[iso3_mask]], ignore_index=True)

    input_db.dump_df("social_mixing_2020", df)
示例#10
0
def test_plot_post_calibration(tmp_path):
    """
    Smoke test: post-calibration plotting writes the expected files and folders.
    """
    plot_dir = tmp_path
    mcmc_dir_path = os.path.join(tmp_path, "mcmc")
    os.makedirs(mcmc_dir_path)

    # Two identical target specs that differ only in their name.
    targets = {
        name: {
            "output_key": name,
            "title": name,
            "times": [],
            "values": [],
            "quantiles": [0.25, 0.5, 0.75],
        }
        for name in ("incidence", "foo")
    }

    # A dummy prior so that the posterior checks have something to work with.
    dummy_prior = {
        "param_name": "contact_rate",
        "distribution": "uniform",
        "distri_params": [0.01, 0.03],
    }
    priors = [dummy_prior]

    def noisy_linear(t):
        return 2 * t + random.random()

    def noisy_cubic(t):
        return t**3 + random.random()

    # Build data for plotting
    do_df, mcmc_df, params_df = build_synthetic_calibration(
        targets,
        [noisy_linear, noisy_cubic],
        chains=2,
        runs=20,
        times=20,
    )

    # Create one database per chain, each holding only that chain's rows.
    tables = {
        "mcmc_run": mcmc_df,
        "mcmc_params": params_df,
        "derived_outputs": do_df,
    }
    for chain in set(mcmc_df["chain"].tolist()):
        db = Database(os.path.join(mcmc_dir_path, f"chain-{chain}.db"))
        for table_name, table_df in tables.items():
            db.dump_df(table_name, table_df[table_df["chain"] == chain])

    # Create plots
    plot_post_calibration(targets, mcmc_dir_path, plot_dir, priors)

    # Super basic check: every expected entry exists and directories are non-empty.
    expected_files = (
        "burn-in.png",
        "loglikelihood-traces.png",
        "acceptance_ratio.png",
        "params-traces",
        "calibration-fit",
        "params-vs-loglikelihood",
        "posteriors",
    )
    for fname in expected_files:
        expected_path = os.path.join(plot_dir, fname)
        assert os.path.exists(expected_path)
        if os.path.isdir(expected_path):
            assert len(os.listdir(expected_path)) > 0