Example #1
def load_spei(parallel=True) -> None:
    """ Load SPEI """
    log.info("Started loading SPEI.")
    db.drop_schema("spei_v2")
    db.create_schema("spei_v2")
    df_pg_ug, df_m, df_ug_pgm = _get_id_dfs()

    with tempfile.TemporaryDirectory() as tempdir:
        paths = common.get_files_latest_fetch(name="spei", tempdir=tempdir)

        if parallel:
            with mp.Pool(processes=mp.cpu_count()) as pool:
                results = [
                    pool.apply_async(
                        func=_load_spei_from_path,
                        args=(path, df_pg_ug, df_m, df_ug_pgm),
                    ) for path in paths
                ]
                _ = [result.get() for result in results]
        else:
            for path in paths:
                _load_spei_from_path(path, df_pg_ug, df_m, df_ug_pgm)

    _stage_spei()
    log.info("Finished loading SPEI.")
Example #2
def load_fvp() -> None:
    """ Load FVP data """
    log.info("Started loading FVP")
    with tempfile.TemporaryDirectory() as tempdir:
        _ = common.get_files_latest_fetch(name="fvp", tempdir=tempdir)
        df = io.csv_to_df(path=os.path.join(tempdir, "MasterData.csv"))

    df = df.drop(columns=["Conflict"])
    df = df.rename(columns=lambda col: col.lower())
    df = df.set_index(["year", "gwno"])

    spec = io.load_yaml(
        path=os.path.join(os.path.dirname(__file__), "spec.yaml"))
    df = df[spec["cols"]]

    log.debug("Fetching df_keys")
    query = "SELECT id AS country_id, gwcode AS gwno FROM staging.country;"
    df_keys = (
        db.query_to_df(query=query)
        .sort_values(by="country_id", ascending=False)
        .drop_duplicates(subset=["gwno"])
        .set_index(["gwno"])
    )
    df = df.join(df_keys)

    log.debug("Joining to skeleton")
    df = db.db_to_df(
        fqtable="skeleton.cy_global",
        ids=["year", "country_id"],
        cols=["year", "country_id"],
    ).join(df.reset_index().set_index(["year", "country_id"]), how="left")

    df = df.drop(columns=["gwno"])

    # Add consistent fvp_ prefix
    df = df.rename(
        columns=lambda col: col if col.startswith("fvp_") else f"fvp_{col}")
    df = df.sort_index(axis=1).sort_index(axis=0)

    # Push raw
    db.create_schema("fvp_v2")
    db.df_to_db(fqtable="fvp_v2.cy_unimp", df=df)

    # Extrapolate before imputing
    df = missing.extrapolate(df)

    # Impute and push
    for i, df_imp in enumerate(
            missing.impute_mice_generator(
                df=df,
                n_imp=10,
                estimator=DecisionTreeRegressor(max_features="sqrt"),
                parallel=True,
            )):
        db.df_to_db(df=df_imp, fqtable=f"fvp_v2.cy_imp_sklearn_{i}")

    log.info("Fininshed loading FVP")
Example #3
def load_cdum() -> None:
    """ Load country dummies """

    df = db.db_to_df(fqtable="staging.country", cols=["id"], ids=["id"])
    df = df.reset_index().rename(columns={"id": "country_id"})
    df["to_dummy"] = df["country_id"]
    df = df.set_index(["country_id"])
    df = pd.get_dummies(df.to_dummy, prefix="cdum")
    db.drop_schema("cdum")
    db.create_schema("cdum")
    db.df_to_db(fqtable="cdum.c", df=df)
Example #4
def load_reign() -> None:
    """ Load reign """
    log.info("Started loading reign.")

    spec = io.load_yaml(os.path.join(os.path.dirname(__file__), "spec.yaml"))
    with tempfile.TemporaryDirectory() as tempdir:
        paths = common.get_files_latest_fetch(name="reign", tempdir=tempdir)
        path_csv = [path for path in paths if path.endswith(".csv")].pop()
        df = io.csv_to_df(path=path_csv)

    df = fix_ccodes(df, spec)
    df = encode_govt_dummies(df)

    df = df.set_index(["year", "month", "ccode"])
    df = df.join(
        db.query_to_df(query="""
                SELECT id AS country_id, gwcode AS ccode
                FROM staging.country WHERE gweyear=2016;
                """).set_index(["ccode"]))
    df = df.join(
        db.query_to_df(query="""
            SELECT id AS month_id, year_id AS year, month FROM staging.month;
            """).set_index(["year", "month"]))
    df = df.reset_index().set_index(["month_id", "country_id"])
    df = df.drop(
        columns=["year", "month", "ccode", "country", "government", "leader"])

    df_skeleton = db.db_to_df(
        fqtable="skeleton.cm_global",
        cols=["month_id", "country_id"],
        ids=["month_id", "country_id"],
    )
    len_skel = len(df_skeleton)
    df = df_skeleton.join(df, how="left")
    if len(df) != len_skel:
        raise RuntimeError(f"Join not correct, {len_skel} != {len(df)}")

    df = df.add_prefix("reign_")

    db.drop_schema("reign_v2")
    db.create_schema("reign_v2")
    db.df_to_db(df=df, fqtable="reign_v2.cm_unimp")

    db.df_to_db(
        df=missing.fill_groups_with_time_means(missing.extrapolate(df)),
        fqtable="reign_v2.cm_extrapolated",
    )

    log.info("Finished loading reign.")
Example #5
def load_ged() -> None:
    """ Collect imputed and unimputed GED """

    log.info("Started loading GED.")

    load_legacy_ged("20.9.4", 484, 484)  # 2020-04

    db.drop_schema("ged")
    db.create_schema("ged")
    db.execute_query(
        query=io.read_file(
            path=os.path.join(os.path.dirname(__file__), "ged.sql")
        )
    )
    log.info("Finished loading GED.")
Example #6
def load_wdi() -> None:
    """ Load WDI to database """

    log.info("Started loading WDI.")
    df = _load_and_stage_wdi()

    schema = "wdi_202005"
    db.drop_schema(schema)
    db.create_schema(schema)

    # Push completely raw but staged data
    log.debug("Done preparing raw WDI, pushing.")
    fqtables = [f"{schema}.cy_unimp_part_{p}" for p in [1, 2]]
    db.df_to_split_db(df=df, fqtables=fqtables)

    _fill_and_push_wdi(df=df, schema=schema, n_imp=5)
    log.info("Finished loading WDI.")
Example #7
def load_acled() -> None:
    """ Code that brings acled to staging yet to be merged """

    log.info("Started loading ACLED.")

    load_legacy_acled(
        from_date="2020-01-01", from_month_id=483, to_month_id=484
    )

    db.drop_schema("acled")
    db.create_schema("acled")

    db.execute_query(
        query=io.read_file(
            path=os.path.join(os.path.dirname(__file__), "acled.sql")
        )
    )
    log.info("Finished loading ACLED.")
Example #8
def load_icgcw() -> None:
    """ Load ICGCW """
    log.info("Starting ICGCW import")

    spec = io.load_yaml(
        path=os.path.join(os.path.dirname(__file__), "spec.yaml"))
    # Get all the entries as list of dicts
    entries = load_and_parse_entries(parallel=True)
    entries = drop_unnamed_entries(entries)

    # Some renames depend on splits so we do splits twice
    entries = split_multi_country_entries(entries)
    entries = do_renames(entries, spec)
    entries = split_multi_country_entries(entries)

    entries = drop_drops(entries, drops=spec["drops"])

    debug_log_unique_names(entries)

    df = pd.DataFrame(entries)
    df = set_dates(df)
    df = drop_duplicate_country_months(df)
    df = df.set_index(["year", "month", "name"])
    cols = [
        "alerts",
        "opportunities",
        "deteriorated",
        "improved",
        "unobserved",
    ]
    df = df[cols]

    df_c = (
        db.query_to_df("SELECT id AS country_id, name FROM staging.country;")
        .sort_values(by=["country_id"], ascending=False)
        .drop_duplicates(subset=["name"])
        .set_index(["name"])
    )
    df_m = (
        db.query_to_df(
            "SELECT id AS month_id, year_id AS year, month FROM staging.month;"
        )
        .set_index(["year", "month"])
        .sort_index()
    )
    df_skeleton = db.db_to_df(
        fqtable="skeleton.cm_global",
        cols=["month_id", "country_id"],
        ids=["month_id", "country_id"],
    )

    df = (
        df.join(df_c)
        .join(df_m)
        .dropna(subset=["country_id"])
        .set_index(["month_id", "country_id"])
        .sort_index()
    )
    df = df_skeleton.join(df, how="left")

    df = missing.extrapolate(df)
    df["unobserved"] = df["unobserved"].fillna(1)
    df = df.fillna(0)

    # order cols and rows
    df = df.sort_index(axis=1).sort_index()
    df = df.add_prefix("icgcw_")

    # @TODO: Change this to icgcw without v2 once we drop flat
    schema = "icgcw_v2"
    db.drop_schema(schema)
    db.create_schema(schema)
    db.df_to_db(fqtable=f"{schema}.cm", df=df)
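The missing-data rule at the end is easy to miss: after extrapolation, country-months with no ICGCW entry keep unobserved = 1 and zeros in every other column. A toy illustration (column names follow the frame above, values are made up):

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {"alerts": [1.0, np.nan], "unobserved": [0.0, np.nan]},
    index=pd.Index([490, 491], name="month_id"),
)
df["unobserved"] = df["unobserved"].fillna(1)
df = df.fillna(0)
# month 490 keeps its observed values; month 491 had no entry and ends up
# with alerts = 0 and unobserved = 1.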