Exemplo n.º 1
0
def prune_chain(source_db_path: str, target_db_path: str):
    """
    Read the model outputs from a database and remove output data that is not
    from the maximum-likelihood-estimate (MLE) run.
    This is an operation applied to each chain's database.

    Args:
        source_db_path: Path to the database to read tables from.
        target_db_path: Path to the database the pruned tables are written to.
    """
    logger.info("Pruning %s into %s", source_db_path, target_db_path)
    source_db = get_database(source_db_path)
    target_db = get_database(target_db_path)

    # Find the maximum accepted loglikelihood for all runs.
    mcmc_run_df = source_db.query("mcmc_run")
    mle_run_df = find_mle_run(mcmc_run_df)
    mle_run_id = mle_run_df.run.iloc[0]
    mle_chain_id = mle_run_df.chain.iloc[0]

    # Copy tables over, pruning the outputs table down to just the MLE run.
    for table_name in source_db.table_names():
        table_df = source_db.query(table_name)
        if table_name == "outputs":
            # Drop everything except the MLE run.
            logger.info("Pruning outputs so that it only contains max likelihood runs")
            mle_mask = (table_df["run"] == mle_run_id) & (table_df["chain"] == mle_chain_id)
            target_db.dump_df(table_name, table_df[mle_mask])
        else:
            # FIX: was `elif table_name:`, which is always true for a non-empty
            # table name — a plain `else` states the intent clearly.
            # Copy the table over verbatim (mcmc_run, mcmc_params, derived_outputs).
            logger.info("Copying %s", table_name)
            target_db.dump_df(table_name, table_df)

    logger.info("Finished pruning %s into %s", source_db_path, target_db_path)
Exemplo n.º 2
0
def save_mle_params(database_path: str, target_path: str):
    """
    Write the MCMC parameters of the maximum-likelihood run to a YAML file.

    Args:
        database_path: Path to the calibration database to read from.
        target_path: Destination path for the YAML file.
    """
    database = get_database(database_path)
    runs_df = database.query("mcmc_run")
    params_df = database.query("mcmc_params")
    best_params = process.find_mle_params(runs_df, params_df)
    with open(target_path, "w") as dest_file:
        yaml.dump(best_params, dest_file)
Exemplo n.º 3
0
def collate_databases(src_db_paths: List[str],
                      target_db_path: str,
                      tables=None):
    """
    Copy the tables of many calibration databases into a single database.

    Args:
        src_db_paths: Paths of the source databases to read.
        target_db_path: Path of the database all tables are dumped into.
        tables: Optional whitelist of table names; when given, any table not
            in this collection is skipped.

    NOTE(review): an earlier docstring claimed run names are renamed to be
    ascending — no renaming is visible in this function; verify downstream.
    """
    logger.info("Collating db outputs into %s", target_db_path)
    target_db = get_database(target_db_path)
    for src_path in src_db_paths:
        logger.info("Reading data from %s", src_path)
        src_db = get_database(src_path)
        for name in src_db.table_names():
            # Honour the optional table whitelist.
            if tables and name not in tables:
                logger.info("Skipping table %s", name)
                continue

            logger.info("Copying table %s", name)
            target_db.dump_df(name, src_db.query(name))

    logger.info("Finished collating db outputs into %s", target_db_path)
Exemplo n.º 4
0
def add_uncertainty_quantiles(database_path: str, targets: dict):
    """
    Add an uncertainty table to a given database, based on mcmc_run and derived_outputs.
    The table will have columns scenario/type/time/quantile/value.

    Args:
        database_path: Path of the database to read from and write to.
        targets: Calibration targets passed through to the uncertainty calculation.
    """
    logger.info("Calculating uncertainty for %s", database_path)
    db = get_database(database_path)
    if "uncertainty" in db.table_names():
        # Re-runs must replace, not append to, any previous uncertainty values.
        logger.info(
            "Deleting existing uncertainty table in %s",
            database_path,
        )
        # FIX: dropped pointless f-string prefix — the statement has no placeholders.
        db.engine.execute("DELETE FROM uncertainty")

    logger.info("Loading data into memory")
    mcmc_df = db.query("mcmc_run")
    do_df = db.query("derived_outputs")
    logger.info("Calculating uncertainty")
    uncertainty_df = calculate_mcmc_uncertainty(mcmc_df, do_df, targets)
    db.dump_df("uncertainty", uncertainty_df)
    logger.info("Finished writing uncertainties")
Exemplo n.º 5
0
def run_full_model_for_chain(
    run_id: str, src_db_path: str, chain_id: int, burn_in: int, sample_size: int, quiet: bool
):
    """
    Run the full model (all time steps, all scenarios) for a subset of accepted calibration runs.
    It works like this:
        - We start off with a calibration chain of length C
        - We apply "burn in" by throwing away the first B iterations of the chain, leaving us with C - B iterations
        - We then sample runs from the chain using a "sample size" parameter S by calculating N = floor(C - B / S)
          once we know N, we then start from the end of the chain, working backwards, and select every Nth run
              if a run is accepted then we select it
              if a run is not accepted, we select the first accepted run that precedes it

    Once we've sampled all the runs we need, then we re-run them in full, including all their scenarios.

    Args:
        run_id: Identifier of the app/region run, parsed by get_app_region.
        src_db_path: Path to the source calibration database for this chain.
        chain_id: Identifier of this calibration chain; returned on success.
        burn_in: Number of initial MCMC runs to discard.
        sample_size: Approximate number of runs to sample for full re-runs.
        quiet: Suppresses verbose logging when True.

    Returns:
        The chain_id, so callers can tell which chain finished.
    """
    set_logging_config(not quiet, chain_id)
    msg = "Running full models for chain %s with burn-in of %s and sample size of %s."
    logger.info(msg, chain_id, burn_in, sample_size)
    try:
        app_region = get_app_region(run_id)
        msg = f"Running the {app_region.app_name} {app_region.region_name} model"
        logger.info(msg)

        dest_db_path = os.path.join(FULL_RUN_DATA_DIR, f"chain-{chain_id}")
        src_db = get_database(src_db_path)
        dest_db = get_database(dest_db_path)

        # Burn in MCMC parameter history and copy it across so it can be used in visualizations downstream.
        # Don't apply sampling to it - we want to see the whole parameter space that was explored.
        mcmc_params_df = src_db.query(Table.PARAMS)
        burn_mask = mcmc_params_df["run"] >= burn_in
        dest_db.dump_df(Table.PARAMS, mcmc_params_df[burn_mask])

        # Add some extra columns to MCMC run history to track sampling.
        mcmc_run_df = src_db.query(Table.MCMC)
        num_runs = len(mcmc_run_df)
        msg = f"Tried to burn {burn_in} runs with sample size {sample_size}, but there are only {num_runs}"
        assert num_runs > (burn_in + sample_size), msg

        # Sampled column tells us whether a run will be sampled.
        # NOTE(review): assumes the dataframe index is a 0-based RangeIndex so
        # `idx` is positional — confirm against how Table.MCMC is stored.
        sampled = []
        sample_step = max(1, (num_runs - burn_in) // sample_size)
        logger.info("Using a sample step of %s", sample_step)
        for idx, mcmc_run in mcmc_run_df.iterrows():
            # Work backwards from the end of the chain, marking every Nth run.
            should_sample = 1 if (num_runs - idx - 1) % sample_step == 0 else 0
            sampled.append(should_sample)

        mcmc_run_df["sampled"] = sampled

        # Parent column tells us which accepted run precedes this run.
        # FIX: replaced the hand-rolled `i_row` counter (marked FIXME) with enumerate.
        parents = []
        for row_num, (_, mcmc_run) in enumerate(mcmc_run_df.iterrows()):
            if mcmc_run["accept"] or row_num == 0:
                # Accepted runs are their own parent; rejected runs inherit
                # the most recent accepted run's id.
                parent = int(mcmc_run["run"])

            parents.append(parent)

        mcmc_run_df["parent"] = parents

        # Burn in MCMC run history.
        burn_mask = mcmc_run_df["run"] >= burn_in
        burned_runs_str = ", ".join([str(i) for i in mcmc_run_df[~burn_mask].run])
        mcmc_run_df = mcmc_run_df[burn_mask].copy()
        num_remaining = len(mcmc_run_df)
        logger.info(
            "Burned %s of %s MCMC runs leaving %s remaining.", burn_in, num_runs, num_remaining
        )

        logger.info("Burned MCMC runs %s", burned_runs_str)
        dest_db.dump_df(Table.MCMC, mcmc_run_df)

        # Figure out which model runs to actually re-run.
        sampled_run_ids = mcmc_run_df[mcmc_run_df["sampled"] == 1].parent.unique().tolist()

        # Also include the MLE
        mle_df = db.process.find_mle_run(mcmc_run_df)
        mle_run_id = mle_df["run"].iloc[0]
        logger.info("Including MLE run %s", mle_run_id)
        sampled_run_ids.append(mle_run_id)
        sampled_run_ids = sorted(set(sampled_run_ids))
        logger.info(
            "Running full model for %s sampled runs %s", len(sampled_run_ids), sampled_run_ids
        )

        outputs = []
        derived_outputs = []
        for sampled_run_id in sampled_run_ids:
            try:
                mcmc_run = mcmc_run_df.loc[mcmc_run_df["run"] == sampled_run_id].iloc[0]
            except IndexError:
                # This happens when we try to sample a parent run that has been burned, we log this and ignore it.
                # FIX: logger.warn is a deprecated alias for logger.warning.
                logger.warning("Skipping (probably) burned parent run id %s", sampled_run_id)
                continue

            # FIX: use distinct local names — the originals shadowed the
            # function's run_id/chain_id parameters, which corrupted the
            # chain_id returned and logged after the loop.
            mcmc_run_id = mcmc_run["run"]
            mcmc_chain_id = mcmc_run["chain"]
            assert mcmc_run["accept"]
            logger.info("Running full model for MCMC run %s", mcmc_run_id)
            param_updates = db.load.load_mcmc_params(dest_db, mcmc_run_id)

            # FIX: named function instead of a lambda assignment (PEP 8 E731).
            def update_func(ps, _updates=param_updates):
                return update_params(ps, _updates)

            with Timer("Running model scenarios"):
                scenarios = app_region.build_and_run_scenarios(update_func=update_func)

            mcmc_run_id = int(mcmc_run_id)
            mcmc_chain_id = int(mcmc_chain_id)

            with Timer("Processing model outputs"):
                processed_outputs = app_region.process_scenario_outputs(
                    scenarios, mcmc_run_id, mcmc_chain_id
                )
                outputs.append(processed_outputs[Table.OUTPUTS])
                derived_outputs.append(processed_outputs[Table.DERIVED])

        with Timer("Saving model outputs to the database"):
            final_outputs = {}
            final_outputs[Table.OUTPUTS] = pd.concat(outputs, copy=False, ignore_index=True)
            final_outputs[Table.DERIVED] = pd.concat(derived_outputs, copy=False, ignore_index=True)
            db.store.save_model_outputs(dest_db, **final_outputs)

    except Exception:
        logger.exception("Full model run for chain %s failed", chain_id)
        raise

    logger.info("Finished running full models for chain %s.", chain_id)
    return chain_id
Exemplo n.º 6
0
def powerbi_postprocess(source_db_path: str, target_db_path: str, run_id: str):
    """
    Read the model outputs from a database and then convert them into a form
    that is readable by our PowerBI dashboard.
    Save the converted data into its own database.

    Args:
        source_db_path: Path of the model-output database to read.
        target_db_path: Path of the PowerBI-ready database to write.
        run_id: Run identifier parsed into app/region/timestamp/commit.
    """
    source_db = get_database(source_db_path)
    target_db = get_database(target_db_path)
    tables_to_copy = [t for t in source_db.table_names() if t != "outputs"]
    for table_name in tables_to_copy:
        logger.info("Copying %s", table_name)
        table_df = source_db.query(table_name)
        if table_name == "uncertainty":
            # Rename "time" field to "times".
            # FIX: DataFrame.rename returns a new frame — the original
            # discarded the result, so the rename was a silent no-op.
            table_df = table_df.rename(columns={"time": "times"})

        target_db.dump_df(table_name, table_df)

    app_name, region_name, timestamp, git_commit = read_run_id(run_id)

    # Add build metadata table
    build_key = f"{timestamp}-{git_commit}"
    logger.info("Adding 'build' metadata table with key %s", build_key)
    build_df = pd.DataFrame.from_dict({
        "build_key": [build_key],
        "app_name": [app_name],
        "region_name": [region_name]
    })
    target_db.dump_df("build", build_df)

    # Add scenario metadata table
    logger.info("Adding 'scenario' metadata table")
    params = load_params(app_name, region_name)
    # Add default scenario (scenario 0), then each configured scenario.
    scenario_data = [{
        "scenario": 0,
        "start_time": int(params["default"]["time"]["start"]),
        "description": params["default"].get("description", ""),
    }]
    for sc_idx, sc_params in params["scenarios"].items():
        sc_datum = {
            "scenario": int(sc_idx),
            "start_time": int(sc_params["time"]["start"]),
            "description": sc_params.get("description", ""),
        }
        scenario_data.append(sc_datum)

    scenario_df = pd.DataFrame(scenario_data)
    target_db.dump_df("scenario", scenario_df)

    # Add calibration targets as long-form rows of (key, times, value).
    logger.info("Adding 'targets' table")
    targets = load_targets(app_name, region_name)
    targets_data = []
    for target in targets.values():
        for t, v in zip(target["times"], target["values"]):
            t_datum = {
                "key": target["output_key"],
                "times": t,
                "value": v,
            }
            targets_data.append(t_datum)

    targets_df = pd.DataFrame(targets_data)
    target_db.dump_df("targets", targets_df)

    logger.info("Converting outputs to PowerBI format")
    outputs_df = source_db.query("outputs")
    pbi_outputs_df = unpivot_outputs(outputs_df)
    target_db.dump_df("powerbi_outputs", pbi_outputs_df)
    logger.info("Finished creating PowerBI output database at %s",
                target_db_path)