Example No. 1
def load_derived_output_tables(calib_dirpath: str):
    """Load the derived_outputs table from every output database in the calibration directory."""
    derived_output_tables = []
    for db_path in _find_db_paths(calib_dirpath):
        db = Database(db_path)
        derived_output_tables.append(db.query("derived_outputs"))

    return derived_output_tables
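
Here and throughout, `Database` is the project's own SQLite wrapper; its `query` method is assumed to read a whole table into a pandas DataFrame. A minimal sketch of that assumption, using only pandas and SQLAlchemy (the helper name is hypothetical):

# Minimal sketch (not the project's Database class): read one SQLite table into a
# DataFrame, which is roughly what db.query("derived_outputs") is assumed to do.
import pandas as pd
from sqlalchemy import create_engine

def read_table(db_path: str, table_name: str) -> pd.DataFrame:
    engine = create_engine(f"sqlite:///{db_path}")
    return pd.read_sql_table(table_name, con=engine)
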
Example No. 2
def plot_timeseries_with_uncertainty_for_powerbi(
    region_name: str, powerbi_db_path: str, output_dir: str
):
    """
    works on powerbi version
    Assumes a COVID model.
    TODO: Unify PowerBI and local version
    """
    os.makedirs(output_dir, exist_ok=True)
    plot_config = load_plot_config(region_name)
    db = Database(powerbi_db_path)
    uncertainty_df = db.query("uncertainty")
    outputs = uncertainty_df["type"].unique().tolist()
    quantile_vals = uncertainty_df["quantile"].unique().tolist()
    for output_name in outputs:
        this_output_dir = os.path.join(output_dir, output_name)
        os.makedirs(this_output_dir, exist_ok=True)
        plotter = FilePlotter(this_output_dir, plot_config["translations"])
        mask = uncertainty_df["type"] == output_name
        output_df = uncertainty_df[mask]
        scenarios = output_df.Scenario.unique().tolist()
        for scenario in scenarios:
            mask = output_df["Scenario"] == scenario
            scenario_df = output_df[mask]
            quantiles = {}
            for q in quantile_vals:
                mask = scenario_df["quantile"] == q
                quantiles[q] = scenario_df[mask]["value"].tolist()

            times = scenario_df.time.unique()
            logger.info("Plotting uncertainty for output %s, scenario %s", output_name, scenario)
            plots.plot_timeseries_with_uncertainty_for_powerbi(
                plotter, output_name, scenario, quantiles, times, plot_config
            )
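
For reference, a self-contained toy of the reshaping done in the inner loops above: a long-format uncertainty table (column names taken from the code, values illustrative) is split into one list of values per quantile.

# Toy data mimicking the expected uncertainty table layout.
import pandas as pd

uncertainty_df = pd.DataFrame({
    "type": ["incidence"] * 4,
    "Scenario": ["S_0"] * 4,
    "time": [0, 0, 1, 1],
    "quantile": [0.25, 0.75, 0.25, 0.75],
    "value": [1.0, 2.0, 1.5, 2.5],
})
quantiles = {
    q: uncertainty_df[uncertainty_df["quantile"] == q]["value"].tolist()
    for q in uncertainty_df["quantile"].unique()
}
assert quantiles == {0.25: [1.0, 1.5], 0.75: [2.0, 2.5]}
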
Example No. 3
def load_mcmc_tables(calib_dirpath: str):
    """Load the mcmc_run table from every output database in the calibration directory."""
    mcmc_tables = []
    for db_path in _find_db_paths(calib_dirpath):
        db = Database(db_path)
        mcmc_tables.append(db.query("mcmc_run"))

    return mcmc_tables
Example No. 4
def test_calibrate_autumn_mcmc(temp_data_dir):
    # Import autumn stuff inside function so we can mock out the database.
    priors = [{
        "param_name": "ice_cream_sales",
        "distribution": "uniform",
        "distri_params": [1, 5],
    }]
    target_outputs = [{
        "output_key": "shark_attacks",
        "years": [2000, 2001, 2002, 2003, 2004],
        "values": [3, 6, 9, 12, 15],
        "loglikelihood_distri": "poisson",
    }]
    multipliers = {}
    params = {
        "default": {
            "start_time": 2000
        },
        "scenario_start_time": 2000,
        "scenarios": {},
    }
    calib = Calibration(
        "sharks",
        _build_mock_model,
        params,
        priors,
        target_outputs,
        multipliers,
        1,
        1,
    )
    calib.run_fitting_algorithm(
        run_mode=CalibrationMode.AUTUMN_MCMC,
        n_iterations=50,
        n_burned=10,
        n_chains=1,
        available_time=1e6,
    )
    app_dir = os.path.join(temp_data_dir, "outputs", "calibrate", "sharks",
                           "main")
    run_dir = os.path.join(app_dir, os.listdir(app_dir)[0])
    db_fname = [
        fname for fname in os.listdir(run_dir) if fname.endswith(".db")
    ][0]
    out_db_path = os.path.join(run_dir, db_fname)
    assert os.path.exists(out_db_path)

    out_db = Database(out_db_path)
    assert set(out_db.engine.table_names()) == {
        "outputs",
        "derived_outputs",
        "mcmc_run",
    }
    mcmc_runs = out_db.query("mcmc_run")
    max_idx = mcmc_runs.loglikelihood.idxmax()
    best_run = mcmc_runs.iloc[max_idx]
    ice_cream_sales_mle = best_run.ice_cream_sales
    # The exact MLE varies slightly between runs, so assert a tolerance around the expected value of 3.
    assert 2.9 < ice_cream_sales_mle < 3.1
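
The asserted range makes sense if the mock model reproduces the target series exactly at ice_cream_sales = 3. `_build_mock_model` is not shown, so the sketch below is a hypothetical stand-in that only illustrates why a Poisson log-likelihood over these targets peaks near 3.

# Hypothetical illustration only: assume the mock model predicts
# shark_attacks = ice_cream_sales * (year - 1999), so the targets
# [3, 6, 9, 12, 15] are matched exactly when ice_cream_sales == 3.
import numpy as np
from scipy.stats import poisson

targets = np.array([3, 6, 9, 12, 15])

def loglikelihood(sales: float) -> float:
    expected = sales * np.arange(1, 6)
    return poisson.logpmf(targets, expected).sum()

best = max(np.linspace(1, 5, 41), key=loglikelihood)
print(round(float(best), 2))  # 3.0
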
Example No. 5
def load_output_tables(calib_dirpath: str):
    """Load the outputs table from every output database in the calibration directory."""
    output_tables = []
    for db_path in find_db_paths(calib_dirpath):
        db = Database(db_path)
        df = db.query("outputs")
        output_tables.append(df)

    return output_tables
Example No. 6
def run_full_models_for_mcmc(burn_in: int, src_db_path: str, dest_db_path: str,
                             build_model, params: dict):
    """
    Run the full baseline model and all scenarios for all accepted MCMC runs in src db.
    """
    src_db = Database(src_db_path)
    dest_db = Database(dest_db_path)

    logger.info("Copying mcmc_run table to %s", dest_db_path)
    mcmc_run_df = src_db.query("mcmc_run")

    # Apply burn in and save to destination
    burned_runs_str = ", ".join(mcmc_run_df[:burn_in].idx)
    logger.info("Burned MCMC runs %s", burned_runs_str)
    mcmc_run_df = mcmc_run_df[burn_in:]
    dest_db.dump_df("mcmc_run", mcmc_run_df)

    mcmc_runs = list(mcmc_run_df.T.to_dict().values())
    for mcmc_run in mcmc_runs:
        meta = {k: v for k, v in mcmc_run.items() if k in META_COLS}
        if not meta["accept"]:
            logger.info("Ignoring non-accepted MCMC run %s", meta["idx"])
            continue

        logger.info("Running full model for MCMC run %s", meta["idx"])
        param_updates = {
            k: v
            for k, v in mcmc_run.items() if k not in META_COLS
        }

        run_idx = meta["idx"].split("_")[-1]

        def update_func(ps: dict):
            return update_params(ps, param_updates)

        with Timer("Running model scenarios"):
            num_scenarios = 1 + len(params["scenarios"].keys())
            scenarios = []
            for scenario_idx in range(num_scenarios):
                scenario = Scenario(build_model, scenario_idx, params)
                scenarios.append(scenario)

            # Run the baseline scenario.
            baseline_scenario = scenarios[0]
            baseline_scenario.run(update_func=update_func)
            baseline_model = baseline_scenario.model

            # Run all the other scenarios
            for scenario in scenarios[1:]:
                scenario.run(base_model=baseline_model,
                             update_func=update_func)

        with Timer("Saving model outputs to the database"):
            models = [s.model for s in scenarios]
            store_run_models(models, dest_db_path, run_idx=run_idx)

    logger.info("Finished running full models for all accepted MCMC runs.")
Example No. 7
def collect_map_estimate(calib_dirpath: str):
    """
    Read all MCMC outputs found in mcmc_db_folder and print the map parameter values.
    :return: dict of parameters
    """
    mcmc_tables = []
    db_paths = [
        os.path.join(calib_dirpath, f) for f in os.listdir(calib_dirpath)
        if f.endswith(".db") and not f.startswith("mcmc_percentiles")
    ]
    for db_path in db_paths:
        db = Database(db_path)
        mcmc_tables.append(
            db.query("mcmc_run").sort_values(by="loglikelihood",
                                             ascending=False))

    print("Maximum loglikelihood for each chain:")
    print([
        mcmc_tables[i]["loglikelihood"].iloc[0]
        for i in range(len(mcmc_tables))
    ])
    print()

    print("Chains' lengths:")
    print([
        len(mcmc_tables[i]["loglikelihood"]) for i in range(len(mcmc_tables))
    ])
    print()

    best_chain_index = np.argmax([
        mcmc_tables[i]["loglikelihood"].iloc[0]
        for i in range(len(mcmc_tables))
    ])
    non_param_cols = ["idx", "Scenario", "loglikelihood", "accept"]
    param_list = [c for c in mcmc_tables[0].columns if c not in non_param_cols]
    map_estimates = {}
    for param in param_list:
        map_estimates[param] = mcmc_tables[best_chain_index][param].iloc[0]
    return map_estimates, best_chain_index
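
The MAP extraction boils down to sorting each chain by loglikelihood and reading the parameter columns off the top row; a self-contained toy of that step (illustrative data):

# Toy MCMC table: the best row is the one with the highest loglikelihood.
import pandas as pd

mcmc_df = pd.DataFrame({
    "idx": ["run_0", "run_1", "run_2"],
    "Scenario": ["S_0"] * 3,
    "ice_cream_sales": [2.0, 3.0, 4.0],
    "loglikelihood": [-4.0, -1.0, -2.0],
    "accept": [1, 1, 1],
}).sort_values(by="loglikelihood", ascending=False)

non_param_cols = ["idx", "Scenario", "loglikelihood", "accept"]
map_estimates = {
    c: mcmc_df[c].iloc[0] for c in mcmc_df.columns if c not in non_param_cols
}
assert map_estimates == {"ice_cream_sales": 3.0}
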
Example No. 8
def plot_uncertainty(targets: dict, powerbi_db_path: str, output_dir: str):
    """
    works on powerbi version
    Assumes a COVID model.
    """
    os.makedirs(output_dir, exist_ok=True)
    db = Database(powerbi_db_path)
    uncertainty_df = db.query("uncertainty")
    outputs = uncertainty_df["type"].unique().tolist()
    for output_name in outputs:
        this_output_dir = os.path.join(output_dir, output_name)
        os.makedirs(this_output_dir, exist_ok=True)
        plotter = FilePlotter(this_output_dir, targets)
        scenario_idxs = uncertainty_df["scenario"].unique().tolist()
        for scenario_idx in scenario_idxs:
            logger.info("Plotting uncertainty for output %s, scenario %s",
                        output_name, scenario_idx)
            if scenario_idx == 0:
                # Just plot the baseline scenario for the full time period.
                scenario_idxs = [0]
                x_low = 0
            else:
                # Plot the baseline compared to the scenario, but only for the time period
                # where the scenario is active.
                scenario_idxs = [0, scenario_idx]
                mask = uncertainty_df["scenario"] == scenario_idx
                x_low = uncertainty_df[mask]["time"].min()

            plots.plot_timeseries_with_uncertainty(
                plotter,
                uncertainty_df,
                output_name,
                scenario_idxs,
                targets,
                x_low=x_low,
            )
Example No. 9
def test_create_power_bi_outputs(tmp_path):
    """
    Ensure that PowerBI outputs are correctly created from a model output database.
    """
    # Prepare models
    models = [
        get_mock_model(
            times=[2000, 2001, 2002, 2003, 2004, 2005],
            outputs=[
                [1, 2, 3, 4, 5, 6, 7, 8],
                [11, 12, 13, 14, 15, 16, 17, 18],
                [21, 22, 23, 24, 25, 26, 27, 28],
                [31, 32, 33, 34, 35, 36, 37, 38],
                [41, 42, 43, 44, 45, 46, 47, 48],
                [5, 4, 3, 2, 1, 0, -1, -2],
            ],
            derived_outputs={
                "times": [2000, 2001, 2002, 2003, 2004, 2005],
                "snacks": [1, 2, 3, 4, 5, 6],
            },
        ),
        get_mock_model(
            times=[2000, 2001, 2002, 2003, 2004, 2005],
            outputs=[
                [51, 52, 53, 54, 55, 56, 57, 58],
                [61, 62, 63, 64, 65, 66, 67, 68],
                [71, 72, 73, 74, 75, 76, 77, 78],
                [81, 82, 83, 94, 95, 96, 97, 98],
                [91, 92, 93, 84, 85, 86, 87, 88],
                [5, 4, 3, 2, 1, 0, -1, -2],
            ],
            derived_outputs={
                "times": [2000, 2001, 2002, 2003, 2004, 2005],
                "snacks": [7, 8, 9, 10, 11, 12],
            },
        ),
    ]
    mcmc_run_df = pd.DataFrame.from_dict({
        "contact_rate": [5, 10, 6, 4],
        "loglikelihood": [-1, -3, -2, -0.5],
        "accept": [1, 0, 0, 1],
    })
    db_path = os.path.join(tmp_path, "out.db")
    powerbi_db_path = os.path.join(tmp_path, "pbi.db")
    # Store the models
    store_run_models(models, db_path)
    store_database(mcmc_run_df, db_path, "mcmc_run", scenario=0, run_idx=1)
    src_db = Database(db_path)
    mcmc_run_src = src_db.query("mcmc_run")
    derived_outputs_src = src_db.query("derived_outputs")

    # Create Power BI outputs
    create_power_bi_outputs(db_path, powerbi_db_path)
    # Query Power BI outputs
    pbi_db = Database(powerbi_db_path)
    table_0 = pbi_db.query("pbi_scenario_0")
    table_1 = pbi_db.query("pbi_scenario_1")
    mcmc_run_dest = pbi_db.query("mcmc_run")
    derived_outputs_dest = pbi_db.query("derived_outputs")

    # Validate derived_outputs copied over
    assert_frame_equal(derived_outputs_src, derived_outputs_dest)

    # Validate MCMC run copied over
    assert_frame_equal(mcmc_run_src, mcmc_run_dest)

    def get_expected_df(model, scenario):
        outputs_df = pd.DataFrame(model.outputs,
                                  columns=model.compartment_names)
        outputs_df.insert(0, "times", model.times)
        outputs_df.insert(0, "Scenario", scenario)
        outputs_df.insert(0, "idx", "run_0")
        return unpivot_outputs(outputs_df)

    # Validate Power BI outputs transformed correctly
    expected_df = get_expected_df(models[0], "S_0")
    assert_frame_equal(expected_df, table_0)

    expected_df = get_expected_df(models[1], "S_1")
    assert_frame_equal(expected_df, table_1)
Example No. 10
def test_collate_outputs(tmp_path):
    """
    Test the collation of multiple calibration output databases into a single file. 
    """
    # Setup database tables
    mcmc_run_cols = [
        "idx", "Scenario", "ice_cream_sales", "loglikelihood", "accept"
    ]
    mcmc_run_1 = [
        ["run_0", "S_0", 1, -1, 1],
        ["run_1", "S_0", 2, -2, 1],
        ["run_2", "S_0", 3, -3, 0],
        ["run_3", "S_0", 4, -4, 1],
    ]
    mcmc_run_2 = [
        ["run_0", "S_0", 11, -11, 1],
        ["run_1", "S_0", 12, -12, 0],
        ["run_2", "S_0", 13, -13, 1],
        ["run_3", "S_0", 14, -14, 1],
    ]
    derived_outputs_cols = ["idx", "Scenario", "times", "shark_attacks"]
    derived_outputs_1 = [
        ["run_0", "S_0", 2000, 3],
        ["run_0", "S_0", 2001, 6],
        ["run_0", "S_0", 2002, 10],
        ["run_1", "S_0", 2000, 4],
        ["run_1", "S_0", 2001, 7],
        ["run_1", "S_0", 2002, 11],
        ["run_2", "S_0", 2000, 2],
        ["run_2", "S_0", 2001, 5],
        ["run_2", "S_0", 2002, 9],
        ["run_3", "S_0", 2000, 1],
        ["run_3", "S_0", 2001, 2],
        ["run_3", "S_0", 2002, 3],
    ]
    derived_outputs_2 = [
        ["run_0", "S_0", 2000, 3.1],
        ["run_0", "S_0", 2001, 6.1],
        ["run_0", "S_0", 2002, 10.1],
        ["run_1", "S_0", 2000, 4.1],
        ["run_1", "S_0", 2001, 7.1],
        ["run_1", "S_0", 2002, 11.1],
        ["run_2", "S_0", 2000, 2.1],
        ["run_2", "S_0", 2001, 5.1],
        ["run_2", "S_0", 2002, 9.1],
        ["run_3", "S_0", 2000, 1.1],
        ["run_3", "S_0", 2001, 2.1],
        ["run_3", "S_0", 2002, 3.1],
    ]
    outputs_cols = ["idx", "Scenario", "times", "happy", "sad"]
    outputs_1 = [
        ["run_0", "S_0", 2000, 11, 11],
        ["run_0", "S_0", 2001, 12, 21],
        ["run_0", "S_0", 2002, 13, 31],
        ["run_1", "S_0", 2000, 21, 12],
        ["run_1", "S_0", 2001, 22, 22],
        ["run_1", "S_0", 2002, 23, 32],
        ["run_2", "S_0", 2000, 31, 13],
        ["run_2", "S_0", 2001, 32, 23],
        ["run_2", "S_0", 2002, 33, 33],
        ["run_3", "S_0", 2000, 41, 14],
        ["run_3", "S_0", 2001, 42, 24],
        ["run_3", "S_0", 2002, 43, 34],
    ]
    outputs_2 = [
        ["run_0", "S_0", 2000, 111, 211],
        ["run_0", "S_0", 2001, 112, 221],
        ["run_0", "S_0", 2002, 113, 231],
        ["run_1", "S_0", 2000, 121, 212],
        ["run_1", "S_0", 2001, 122, 222],
        ["run_1", "S_0", 2002, 123, 232],
        ["run_2", "S_0", 2000, 131, 213],
        ["run_2", "S_0", 2001, 132, 223],
        ["run_2", "S_0", 2002, 133, 233],
        ["run_3", "S_0", 2000, 141, 214],
        ["run_3", "S_0", 2001, 142, 224],
        ["run_3", "S_0", 2002, 143, 234],
    ]
    # Create dataframes to save to db
    mcmc_run_1_df = pd.DataFrame(mcmc_run_1, columns=mcmc_run_cols)
    mcmc_run_2_df = pd.DataFrame(mcmc_run_2, columns=mcmc_run_cols)
    derived_outputs_1_df = pd.DataFrame(derived_outputs_1,
                                        columns=derived_outputs_cols)
    derived_outputs_2_df = pd.DataFrame(derived_outputs_2,
                                        columns=derived_outputs_cols)
    outputs_1_df = pd.DataFrame(outputs_1, columns=outputs_cols)
    outputs_2_df = pd.DataFrame(outputs_2, columns=outputs_cols)

    # Connect to test databases
    target_db_path = os.path.join(tmp_path, "target.db")
    db_1_path = os.path.join(tmp_path, "src-1.db")
    db_2_path = os.path.join(tmp_path, "src-2.db")
    src_db_paths = [db_1_path, db_2_path]
    target_db = Database(target_db_path)
    src_1_db = Database(db_1_path)
    src_2_db = Database(db_2_path)

    # Save test data to databases
    mcmc_run_1_df.to_sql("mcmc_run", con=src_1_db.engine, index=False)
    mcmc_run_2_df.to_sql("mcmc_run", con=src_2_db.engine, index=False)
    derived_outputs_1_df.to_sql("derived_outputs",
                                con=src_1_db.engine,
                                index=False)
    derived_outputs_2_df.to_sql("derived_outputs",
                                con=src_2_db.engine,
                                index=False)
    outputs_1_df.to_sql("outputs", con=src_1_db.engine, index=False)
    outputs_2_df.to_sql("outputs", con=src_2_db.engine, index=False)

    collate_outputs(src_db_paths, target_db_path, num_runs=2)

    expected_mcmc_runs = [
        ["run_0", "S_0", 2, -2, 1],
        ["run_1", "S_0", 4, -4, 1],
        ["run_2", "S_0", 13, -13, 1],
        ["run_3", "S_0", 14, -14, 1],
    ]
    expected_derived_outputs = [
        ["run_0", "S_0", 2000, 4],
        ["run_0", "S_0", 2001, 7],
        ["run_0", "S_0", 2002, 11],
        ["run_1", "S_0", 2000, 1],
        ["run_1", "S_0", 2001, 2],
        ["run_1", "S_0", 2002, 3],
        ["run_2", "S_0", 2000, 2.1],
        ["run_2", "S_0", 2001, 5.1],
        ["run_2", "S_0", 2002, 9.1],
        ["run_3", "S_0", 2000, 1.1],
        ["run_3", "S_0", 2001, 2.1],
        ["run_3", "S_0", 2002, 3.1],
    ]
    expected_outputs = [
        ["run_0", "S_0", 2000, 21, 12],
        ["run_0", "S_0", 2001, 22, 22],
        ["run_0", "S_0", 2002, 23, 32],
        ["run_1", "S_0", 2000, 41, 14],
        ["run_1", "S_0", 2001, 42, 24],
        ["run_1", "S_0", 2002, 43, 34],
        ["run_2", "S_0", 2000, 131, 213],
        ["run_2", "S_0", 2001, 132, 223],
        ["run_2", "S_0", 2002, 133, 233],
        ["run_3", "S_0", 2000, 141, 214],
        ["run_3", "S_0", 2001, 142, 224],
        ["run_3", "S_0", 2002, 143, 234],
    ]
    expected_mcmc_run_df = pd.DataFrame(expected_mcmc_runs,
                                        columns=mcmc_run_cols)
    expected_derived_outputs_df = pd.DataFrame(expected_derived_outputs,
                                               columns=derived_outputs_cols)
    expected_outputs_df = pd.DataFrame(expected_outputs, columns=outputs_cols)

    # Extract the outputs
    mcmc_df = target_db.query("mcmc_run")
    derived_outputs_df = target_db.query("derived_outputs")
    outputs_df = target_db.query("outputs")

    # Check that the outputs are correct
    assert_frame_equal(expected_mcmc_run_df, mcmc_df)
    assert_frame_equal(expected_derived_outputs_df, derived_outputs_df)
    assert_frame_equal(expected_outputs_df, outputs_df)
Example No. 11
def test_unpivot_outputs(tmp_path):
    """
    Verify that unpivot_outputs works. 
    """
    out_db_path = os.path.join(tmp_path, "out.db")
    mock_model = get_mock_model(
        times=[2000, 2001, 2002, 2003, 2004, 2005],
        outputs=[
            [300.0, 300.0, 300.0, 33.0, 33.0, 33.0, 93.0, 39.0],
            [271.0, 300.0, 271.0, 62.0, 33.0, 62.0, 93.0, 69.0],
            [246.0, 300.0, 246.0, 88.0, 33.0, 88.0, 93.0, 89.0],
            [222.0, 300.0, 222.0, 111.0, 33.0, 111.0, 39.0, 119.0],
            [201.0, 300.0, 201.0, 132.0, 33.0, 132.0, 39.0, 139.0],
            [182.0, 300.0, 182.0, 151.0, 33.0, 151.0, 39.0, 159.0],
        ],
    )
    store_run_models([mock_model], out_db_path)
    out_db = Database(out_db_path)
    outputs_df = out_db.query("outputs")
    unpivoted_df = unpivot_outputs(outputs_df)
    expected_columns = [
        "idx",
        "Scenario",
        "times",
        "value",
        "age",
        "compartment",
        "mood",
    ]
    expected_data = [
        ["run_0", "S_0", 2000, 300.0, "age_old", "susceptible", "mood_happy"],
        ["run_0", "S_0", 2001, 271.0, "age_old", "susceptible", "mood_happy"],
        ["run_0", "S_0", 2002, 246.0, "age_old", "susceptible", "mood_happy"],
        ["run_0", "S_0", 2003, 222.0, "age_old", "susceptible", "mood_happy"],
        ["run_0", "S_0", 2004, 201.0, "age_old", "susceptible", "mood_happy"],
        ["run_0", "S_0", 2005, 182.0, "age_old", "susceptible", "mood_happy"],
        ["run_0", "S_0", 2000, 300.0, "age_old", "susceptible", "mood_sad"],
        ["run_0", "S_0", 2001, 300.0, "age_old", "susceptible", "mood_sad"],
        ["run_0", "S_0", 2002, 300.0, "age_old", "susceptible", "mood_sad"],
        ["run_0", "S_0", 2003, 300.0, "age_old", "susceptible", "mood_sad"],
        ["run_0", "S_0", 2004, 300.0, "age_old", "susceptible", "mood_sad"],
        ["run_0", "S_0", 2005, 300.0, "age_old", "susceptible", "mood_sad"],
        [
            "run_0", "S_0", 2000, 300.0, "age_young", "susceptible",
            "mood_happy"
        ],
        [
            "run_0", "S_0", 2001, 271.0, "age_young", "susceptible",
            "mood_happy"
        ],
        [
            "run_0", "S_0", 2002, 246.0, "age_young", "susceptible",
            "mood_happy"
        ],
        [
            "run_0", "S_0", 2003, 222.0, "age_young", "susceptible",
            "mood_happy"
        ],
        [
            "run_0", "S_0", 2004, 201.0, "age_young", "susceptible",
            "mood_happy"
        ],
        [
            "run_0", "S_0", 2005, 182.0, "age_young", "susceptible",
            "mood_happy"
        ],
        ["run_0", "S_0", 2000, 33.0, "age_young", "susceptible", "mood_sad"],
        ["run_0", "S_0", 2001, 62.0, "age_young", "susceptible", "mood_sad"],
        ["run_0", "S_0", 2002, 88.0, "age_young", "susceptible", "mood_sad"],
        ["run_0", "S_0", 2003, 111.0, "age_young", "susceptible", "mood_sad"],
        ["run_0", "S_0", 2004, 132.0, "age_young", "susceptible", "mood_sad"],
        ["run_0", "S_0", 2005, 151.0, "age_young", "susceptible", "mood_sad"],
        ["run_0", "S_0", 2000, 33.0, "age_old", "infectious", "mood_happy"],
        ["run_0", "S_0", 2001, 33.0, "age_old", "infectious", "mood_happy"],
        ["run_0", "S_0", 2002, 33.0, "age_old", "infectious", "mood_happy"],
        ["run_0", "S_0", 2003, 33.0, "age_old", "infectious", "mood_happy"],
        ["run_0", "S_0", 2004, 33.0, "age_old", "infectious", "mood_happy"],
        ["run_0", "S_0", 2005, 33.0, "age_old", "infectious", "mood_happy"],
        ["run_0", "S_0", 2000, 33.0, "age_old", "infectious", "mood_sad"],
        ["run_0", "S_0", 2001, 62.0, "age_old", "infectious", "mood_sad"],
        ["run_0", "S_0", 2002, 88.0, "age_old", "infectious", "mood_sad"],
        ["run_0", "S_0", 2003, 111.0, "age_old", "infectious", "mood_sad"],
        ["run_0", "S_0", 2004, 132.0, "age_old", "infectious", "mood_sad"],
        ["run_0", "S_0", 2005, 151.0, "age_old", "infectious", "mood_sad"],
        ["run_0", "S_0", 2000, 93.0, "age_young", "infectious", "mood_happy"],
        ["run_0", "S_0", 2001, 93.0, "age_young", "infectious", "mood_happy"],
        ["run_0", "S_0", 2002, 93.0, "age_young", "infectious", "mood_happy"],
        ["run_0", "S_0", 2003, 39.0, "age_young", "infectious", "mood_happy"],
        ["run_0", "S_0", 2004, 39.0, "age_young", "infectious", "mood_happy"],
        ["run_0", "S_0", 2005, 39.0, "age_young", "infectious", "mood_happy"],
        ["run_0", "S_0", 2000, 39.0, "age_young", "infectious", "mood_sad"],
        ["run_0", "S_0", 2001, 69.0, "age_young", "infectious", "mood_sad"],
        ["run_0", "S_0", 2002, 89.0, "age_young", "infectious", "mood_sad"],
        ["run_0", "S_0", 2003, 119.0, "age_young", "infectious", "mood_sad"],
        ["run_0", "S_0", 2004, 139.0, "age_young", "infectious", "mood_sad"],
        ["run_0", "S_0", 2005, 159.0, "age_young", "infectious", "mood_sad"],
    ]
    expected_df = pd.DataFrame(expected_data, columns=expected_columns)
    assert_frame_equal(expected_df, unpivoted_df)
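
`unpivot_outputs` itself is not shown; judging by the expected frame, it melts the wide compartment columns into a long `value` column and splits the stratification tokens out of each compartment name. A rough, hypothetical sketch of that idea (the "X"-separated naming is an assumption):

# Hypothetical sketch only: wide compartment columns melted to long format,
# then the compartment name split into its stratification parts.
import pandas as pd

wide_df = pd.DataFrame({
    "idx": ["run_0", "run_0"],
    "Scenario": ["S_0", "S_0"],
    "times": [2000, 2001],
    "susceptibleXage_oldXmood_happy": [300.0, 271.0],
    "infectiousXage_oldXmood_happy": [33.0, 33.0],
})
long_df = wide_df.melt(id_vars=["idx", "Scenario", "times"], var_name="compartment")
parts = long_df["compartment"].str.split("X", expand=True)
long_df["compartment"] = parts[0]
long_df["age"] = parts[1]
long_df["mood"] = parts[2]
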
Example No. 12
def preprocess_social_mixing(input_db: Database, country_df):
    """Read the social mixing spreadsheets and store them in the input database."""
    for location in LOCATIONS:
        for sheet_number, header_arg in SHEET_NUMBERS:
            sheet_name = f"MUestimates_{location}_{sheet_number}.xlsx"
            sheet_path = os.path.join(MIXING_DIRPATH, sheet_name)
            xl = pd.ExcelFile(sheet_path)
            sheet_names = xl.sheet_names
            iso3s = [get_iso3(n, country_df) for n in sheet_names]
            for idx, sheet_name in enumerate(sheet_names):
                iso3 = iso3s[idx]
                mix_df = pd.read_excel(xl,
                                       header=header_arg,
                                       sheet_name=sheet_name)
                if sheet_number == "2":
                    renames = {n - 1: f"X{n}" for n in range(1, 17)}
                    mix_df.rename(columns=renames, inplace=True)

                mix_df.insert(0, "location",
                              [location for _ in range(len(mix_df))])
                mix_df.insert(0, "iso3", [iso3 for _ in range(len(mix_df))])
                input_db.dump_df("social_mixing", mix_df)

    # Next gen social mixing
    original_mm = input_db.query("social_mixing")

    df = pd.read_csv(
        os.path.join(MIXING_DIRPATH, "synthetic_contacts_2020.csv"))
    df = df[df.setting == "overall"]
    df.drop(columns="setting", inplace=True)
    df.replace(
        {
            "0 to 4": "00 to 04",
            "5 to 9": "05 to 09",
            "all": "all_locations",
            "others": "other_locations",
        },
        inplace=True,
    )

    # The contactor is in j (columns) and the contactee is in i (rows)
    df = df.pivot_table(
        index=["iso3c", "location_contact", "age_cotactee"],
        columns="age_contactor",
        values="mean_number_of_contacts",
    )
    df = df.reset_index()
    df.drop(columns="age_cotactee", inplace=True)

    cols = list(df.columns[2:])
    new_col = ["X" + str(x) for x in range(1, len(cols) + 1)]
    replace_col = dict(zip(cols, new_col))
    df.rename(columns=replace_col, inplace=True)
    df.rename(columns={
        "iso3c": "iso3",
        "location_contact": "location"
    },
              inplace=True)

    iso3_diff = set(original_mm.iso3).difference(df.iso3)
    iso3_mask = original_mm.iso3.isin(iso3_diff)
    df = df.append(original_mm[iso3_mask], ignore_index=True)

    input_db.dump_df("social_mixing_2020", df)