def _stage_spei():
    """
    Stage SPEI.

    Executes "stage.sql" and then "cleanup.sql", both read from the
    directory that contains this module.
    """
    log.debug("Started staging SPEI")
    sql_dir = os.path.dirname(__file__)
    # Order matters: the staging script runs before the cleanup script.
    for script_name in ("stage.sql", "cleanup.sql"):
        sql_text = io.read_file(os.path.join(sql_dir, script_name))
        db.execute_query(query=sql_text)
    log.debug("Finished staging SPEI")
def _get_id_dfs() -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Get dataframes with ids from database.

    Runs "pg_ug.sql" (read from this module's directory) first, then reads
    three id tables.

    Returns:
        A tuple (df_pg_ug, df_m, df_ug_pgm):
        - df_pg_ug: "spei_v2.pg_ug" read with "pg_id" as id column.
        - df_m: "staging.month" re-indexed by ("year", "month"), with the
          "id" column renamed to "month_id".
        - df_ug_pgm: "staging.priogrid_month" joined with df_pg_ug,
          indexed by ("ug_id", "month_id") and reduced to the columns
          "pg_id" and "priogrid_month_id".
    """
    sql_path = os.path.join(os.path.dirname(__file__), "pg_ug.sql")
    db.execute_query(query=io.read_file(path=sql_path))

    df_pg_ug = db.db_to_df(fqtable="spei_v2.pg_ug", ids=["pg_id"])

    # Month lookup keyed by (year, month).
    months = db.db_to_df(
        fqtable="staging.month", cols=["id"], ids=["year_id", "month"]
    )
    months = months.reset_index()
    months = months.rename(columns={"year_id": "year", "id": "month_id"})
    df_m = months.set_index(["year", "month"])

    # Priogrid-month ids, re-keyed through df_pg_ug.
    # NOTE(review): "ug_id" is presumably supplied by df_pg_ug -- confirm.
    pgm = db.db_to_df(
        fqtable="staging.priogrid_month",
        cols=["id"],
        ids=["priogrid_gid", "month_id"],
    )
    pgm = pgm.reset_index()
    pgm = pgm.rename(
        columns={"id": "priogrid_month_id", "priogrid_gid": "pg_id"}
    )
    pgm = pgm.set_index(["pg_id", "month_id"]).join(df_pg_ug)
    pgm = pgm.reset_index().set_index(["ug_id", "month_id"])
    df_ug_pgm = pgm[["pg_id", "priogrid_month_id"]]

    return df_pg_ug, df_m, df_ug_pgm
def _prepare_acled():
    """
    Run "prepare_acled.sql" from this module's directory against the db.
    """
    log.debug("Started _prepare_acled()")
    # The script is plain SQL; nothing is parametrised.
    sql_path = os.path.join(os.path.dirname(__file__), "prepare_acled.sql")
    sql_text = io.read_file(path=sql_path)
    db.execute_query(query=sql_text)
    log.debug("Finished _prepare_acled()")
def _prepare_ged() -> None:
    """
    Run "prepare_ged.sql" from this module's directory against the db.

    Per the original docstring, the script recreates
    preflight.ged_attached and preflight.ged_attached_full; the SQL itself
    lives in the .sql file, not here.
    """
    # Plain string literals: these messages have no placeholders, so the
    # f-prefix was unnecessary (the logged text is unchanged).
    log.debug("Preparing preflight.ged_attached(_full)")
    query = io.read_file(
        path=os.path.join(os.path.dirname(__file__), "prepare_ged.sql")
    )
    # Keyword form, consistent with the other execute_query calls here.
    db.execute_query(query=query)
    log.debug("Done preflight.ged_attached(_full)")
def parse_page(path: str) -> List[Dict[Any, Any]]:
    """
    CrisisWatch parser using bs4.

    Reads the HTML page at `path` and extracts one record per entry block.

    Args:
        path: Location of the HTML page; passed to io.read_file.

    Returns:
        A list with one dict per entry block, each holding the keys
        "date", "name", "alerts", "opportunities", "deteriorated",
        "improved", "unobserved" and "text". The list is empty when the
        page contains no matching entry block.

    Raises:
        AttributeError: If an entry block has no <h3> or no <time> tag.
    """
    # pylint: disable=too-many-locals
    soup = BeautifulSoup(io.read_file(path), "html.parser")

    search = {
        "class": "c-crisiswatch-entry [ o-container o-container--m u-pr ]"
    }
    cls_tag = {"class": "o-crisis-states__detail [ u-ptserif u-fs18 ]"}

    entries = []
    # loop over blocks
    for block in soup.find_all("div", search):
        heading = block.find("h3")

        # Trim surrounding whitespace only. The previous
        # re.sub(r"^s+|s+$", ...) had lost its backslashes and removed
        # literal leading/trailing "s" characters from the name
        # (e.g. "Honduras" -> "Hondura"); str.strip() does the intended job.
        countryname = heading.text.strip()
        entrydate = block.find("time").text

        # entries may have no text
        detail = block.find("div", cls_tag)
        entrytext = "" if detail is None else detail.text.replace("\n\t", "")

        # Status flags are detected from the <use> tags in the heading.
        updates = str(list(heading.find_all("use")))

        entries.append({
            "date": entrydate,
            "name": countryname,
            "alerts": int("#risk-alert" in updates),
            "opportunities": int("#resolution" in updates),
            "deteriorated": int("#deteriorated" in updates),
            "improved": int("#improved" in updates),
            "unobserved": 0,
            "text": entrytext,
        })

    log.debug(f"Read {len(entries)} entries from {path}")
    return entries
def load_ged() -> None:
    """
    Collect imputed and unimputed GED.

    Calls load_legacy_ged, then drops and recreates the "ged" schema and
    executes "ged.sql" from this module's directory.
    """
    log.info("Started loading GED.")

    load_legacy_ged("20.9.4", 484, 484)  # 2020-04

    schema = "ged"
    db.drop_schema(schema)
    db.create_schema(schema)

    sql_path = os.path.join(os.path.dirname(__file__), "ged.sql")
    db.execute_query(query=io.read_file(path=sql_path))

    log.info("Finished loading GED.")
def load_acled() -> None:
    """
    Code that brings acled to staging yet to be merged.

    Calls load_legacy_acled, then drops and recreates the "acled" schema
    and executes "acled.sql" from this module's directory.
    """
    log.info("Started loading ACLED.")

    legacy_args = {
        "from_date": "2020-01-01",
        "from_month_id": 483,
        "to_month_id": 484,
    }
    load_legacy_acled(**legacy_args)

    schema = "acled"
    db.drop_schema(schema)
    db.create_schema(schema)

    sql_path = os.path.join(os.path.dirname(__file__), "acled.sql")
    db.execute_query(query=io.read_file(path=sql_path))

    log.info("Finished loading ACLED.")