def load_spei(parallel=True) -> None:
    """Load SPEI into the spei_v2 schema.

    Recreates the schema, fetches the latest SPEI files into a temp dir,
    loads each file (in parallel across all cores by default), then stages.
    """
    log.info("Started loading SPEI.")
    db.drop_schema("spei_v2")
    db.create_schema("spei_v2")
    df_pg_ug, df_m, df_ug_pgm = _get_id_dfs()
    with tempfile.TemporaryDirectory() as tempdir:
        paths = common.get_files_latest_fetch(name="spei", tempdir=tempdir)
        # One argument tuple per fetched file; id-dataframes are shared.
        work_items = [(path, df_pg_ug, df_m, df_ug_pgm) for path in paths]
        if parallel:
            with mp.Pool(processes=mp.cpu_count()) as pool:
                pool.starmap(_load_spei_from_path, work_items)
        else:
            for item in work_items:
                _load_spei_from_path(*item)
    _stage_spei()
    log.info("Finished loading SPEI.")
def load_fvp() -> None:
    """Load FVP data into the fvp_v2 schema.

    Fetches the latest MasterData.csv, restricts to the columns listed in
    spec.yaml, keys rows by (year, country_id) via staging.country, aligns
    to the skeleton.cy_global skeleton, pushes the raw staged table, then
    extrapolates, imputes with MICE (10 imputations) and pushes each
    imputed table.
    """
    log.info("Started loading FVP")
    with tempfile.TemporaryDirectory() as tempdir:
        _ = common.get_files_latest_fetch(name="fvp", tempdir=tempdir)
        df = io.csv_to_df(path=os.path.join(tempdir, "MasterData.csv"))
    df = df.drop(columns=["Conflict"])
    df = df.rename(columns=lambda col: col.lower())
    df = df.set_index(["year", "gwno"])
    # spec.yaml declares which FVP columns to keep.
    spec = io.load_yaml(
        path=os.path.join(os.path.dirname(__file__), "spec.yaml"))
    df = df[spec["cols"]]
    log.debug("Fetching df_keys")
    query = "SELECT id AS country_id, gwcode AS gwno FROM staging.country;"
    # Keep the highest country_id per gwno (sort desc, drop duplicates).
    df = df.join(
        db.query_to_df(query=query).sort_values(
            by="country_id", ascending=False).drop_duplicates(
                subset=["gwno"]).set_index(["gwno"]))
    log.debug("Joining to skeleton")
    df = db.db_to_df(
        fqtable="skeleton.cy_global",
        ids=["year", "country_id"],
        cols=["year", "country_id"],
    ).join(df.reset_index().set_index(["year", "country_id"]), how="left")
    df = df.drop(columns=["gwno"])
    # Add consistent fvp_ prefix
    df = df.rename(
        columns=lambda col: col if col.startswith("fvp_") else f"fvp_{col}")
    df = df.sort_index(axis=1).sort_index(axis=0)
    # Push raw
    db.create_schema("fvp_v2")
    db.df_to_db(fqtable="fvp_v2.cy_unimp", df=df)
    # Extrapolate before imputing
    df = missing.extrapolate(df)
    # Impute and push
    for i, df_imp in enumerate(
            missing.impute_mice_generator(
                df=df,
                n_imp=10,
                estimator=DecisionTreeRegressor(max_features="sqrt"),
                parallel=True,
            )):
        db.df_to_db(df=df_imp, fqtable=f"fvp_v2.cy_imp_sklearn_{i}")
    log.info("Finished loading FVP")
def load_cdum() -> None:
    """Build one-hot country dummies and push them to the cdum schema."""
    countries = (
        db.db_to_df(fqtable="staging.country", cols=["id"], ids=["id"])
        .reset_index()
        .rename(columns={"id": "country_id"})
    )
    # Duplicate the id so one copy survives as the index after dummification.
    countries["to_dummy"] = countries["country_id"]
    countries = countries.set_index(["country_id"])
    dummies = pd.get_dummies(countries.to_dummy, prefix="cdum")
    db.drop_schema("cdum")
    db.create_schema("cdum")
    db.df_to_db(fqtable="cdum.c", df=dummies)
def load_reign() -> None:
    """Load REIGN into the reign_v2 schema.

    Fetches the latest REIGN csv, fixes country codes per spec.yaml,
    dummy-encodes government types, keys rows by (month_id, country_id),
    aligns to skeleton.cm_global, and pushes both the raw and the
    extrapolated/time-mean-filled tables.
    """
    log.info("Started loading reign.")
    spec = io.load_yaml(os.path.join(os.path.dirname(__file__), "spec.yaml"))
    with tempfile.TemporaryDirectory() as tempdir:
        fetched = common.get_files_latest_fetch(name="reign", tempdir=tempdir)
        csv_path = [p for p in fetched if p.endswith(".csv")].pop()
        df = io.csv_to_df(path=csv_path)
    df = fix_ccodes(df, spec)
    df = encode_govt_dummies(df)
    df = df.set_index(["year", "month", "ccode"])
    # Map ccode -> country_id using the 2016 vintage of gwcodes.
    countries = db.query_to_df(query="""
        SELECT id AS country_id, gwcode AS ccode
        FROM staging.country WHERE gweyear=2016;
        """).set_index(["ccode"])
    df = df.join(countries)
    # Map (year, month) -> month_id.
    months = db.query_to_df(query="""
        SELECT id AS month_id, year_id AS year, month
        FROM staging.month;
        """).set_index(["year", "month"])
    df = df.join(months)
    df = df.reset_index().set_index(["month_id", "country_id"])
    df = df.drop(
        columns=["year", "month", "ccode", "country", "government", "leader"])
    df_skeleton = db.db_to_df(
        fqtable="skeleton.cm_global",
        cols=["month_id", "country_id"],
        ids=["month_id", "country_id"],
    )
    len_skel = len(df_skeleton)
    df = df_skeleton.join(df, how="left")
    # The left join onto the skeleton must not add or drop rows.
    if len(df) != len_skel:
        raise RuntimeError(f"Join not correct, {len_skel} != {len(df)}")
    df = df.add_prefix("reign_")
    db.drop_schema("reign_v2")
    db.create_schema("reign_v2")
    db.df_to_db(df=df, fqtable="reign_v2.cm_unimp")
    db.df_to_db(
        df=missing.fill_groups_with_time_means(missing.extrapolate(df)),
        fqtable="reign_v2.cm_extrapolated",
    )
    log.info("Finished loading reign.")
def load_ged() -> None:
    """Collect imputed and unimputed GED into a fresh ged schema."""
    log.info("Started loading GED.")
    load_legacy_ged("20.9.4", 484, 484)  # 2020-04
    db.drop_schema("ged")
    db.create_schema("ged")
    # The heavy lifting lives in the adjacent ged.sql script.
    sql_path = os.path.join(os.path.dirname(__file__), "ged.sql")
    db.execute_query(query=io.read_file(path=sql_path))
    log.info("Finished loading GED.")
def load_wdi() -> None:
    """Load WDI to database: stage raw data, split-push it, then impute."""
    log.info("Started loading WDI.")
    staged = _load_and_stage_wdi()
    schema = "wdi_202005"
    db.drop_schema(schema)
    db.create_schema(schema)
    log.debug("Done preparing raw WDI, pushing.")
    # Raw staged data is too wide for one table; split across two parts.
    db.df_to_split_db(
        df=staged,
        fqtables=[f"{schema}.cy_unimp_part_{part}" for part in (1, 2)],
    )
    _fill_and_push_wdi(df=staged, schema=schema, n_imp=5)
    log.info("Finished loading WDI.")
def load_acled() -> None:
    """Bring ACLED to staging (legacy loader) and rebuild the acled schema."""
    log.info("Started loading ACLED.")
    load_legacy_acled(
        from_date="2020-01-01", from_month_id=483, to_month_id=484
    )
    db.drop_schema("acled")
    db.create_schema("acled")
    # Schema contents are built by the adjacent acled.sql script.
    sql_path = os.path.join(os.path.dirname(__file__), "acled.sql")
    db.execute_query(query=io.read_file(path=sql_path))
    log.info("Finished loading ACLED.")
def load_icgcw() -> None:
    """Load ICGCW into the icgcw_v2 schema.

    Parses the crawled entries, normalizes country names per spec.yaml,
    keys rows by (month_id, country_id), aligns to skeleton.cm_global,
    extrapolates and fills missingness, then pushes the cm table.
    """
    log.info("Starting ICGCW import")
    spec = io.load_yaml(
        path=os.path.join(os.path.dirname(__file__), "spec.yaml"))
    # Get all the entries as list of dicts
    entries = load_and_parse_entries(parallel=True)
    entries = drop_unnamed_entries(entries)
    # Some renames depend on splits so we do splits twice
    entries = split_multi_country_entries(entries)
    entries = do_renames(entries, spec)
    entries = split_multi_country_entries(entries)
    entries = drop_drops(entries, drops=spec["drops"])
    debug_log_unique_names(entries)

    df = pd.DataFrame(entries)
    df = set_dates(df)
    df = drop_duplicate_country_months(df)
    df = df.set_index(["year", "month", "name"])
    keep_cols = [
        "alerts",
        "opportunities",
        "deteriorated",
        "improved",
        "unobserved",
    ]
    df = df[keep_cols]

    # Country lookup: keep the highest country_id per name.
    df_c = db.query_to_df(
        "SELECT id AS country_id, name FROM staging.country;")
    df_c = df_c.sort_values(by=["country_id"], ascending=False)
    df_c = df_c.drop_duplicates(subset=["name"]).set_index(["name"])

    # Month lookup: (year, month) -> month_id.
    df_m = db.query_to_df("""
        SELECT id AS month_id, year_id AS year, month
        FROM staging.month;
        """).set_index(["year", "month"]).sort_index()

    df_skeleton = db.db_to_df(
        fqtable="skeleton.cm_global",
        cols=["month_id", "country_id"],
        ids=["month_id", "country_id"],
    )
    df = df.join(df_c).join(df_m)
    df = df.dropna(subset=["country_id"])
    df = df.set_index(["month_id", "country_id"]).sort_index()
    df = df_skeleton.join(df, how="left")

    df = missing.extrapolate(df)
    # Months never crawled count as unobserved; everything else is zero.
    df["unobserved"] = df["unobserved"].fillna(1)
    df = df.fillna(0)
    # Order cols and rows
    df = df.sort_index(axis=1).sort_index()
    df = df.add_prefix("icgcw_")

    # @TODO: Change this to icgcw without v2 once we drop flat
    schema = "icgcw_v2"
    db.drop_schema(schema)
    db.create_schema(schema)
    db.df_to_db(fqtable=f"{schema}.cm", df=df)