def qc(country_iso3, config=None):
    """Run quality-control checks (graph and NPIs) for one country.

    :param country_iso3: ISO-3166 alpha-3 code of the country to check
    :param config: optional Config instance; a fresh one is created when None
    """
    cfg = config if config is not None else Config()
    country_params = cfg.parameters(country_iso3)
    country_dir = os.path.join(cfg.MAIN_OUTPUT_DIR, country_iso3)
    check_graph(cfg, country_params, country_iso3, country_dir)
    check_npis(cfg, country_iso3, country_dir)
def graph(country_iso3, end_date, config=None):
    """Assemble the per-admin-region graph for a country and write it out.

    Seeds the graph with mobility edges, layers on exposure, COVID cases,
    WHO data, vulnerability and the contact matrix, then serializes the
    result both as node-link JSON and as a pickle.

    :param country_iso3: ISO-3166 alpha-3 code of the country
    :param end_date: cutoff date passed through to the COVID/WHO loaders
    :param config: optional Config instance; a fresh one is created when None
    """
    cfg = config if config is not None else Config()
    params = cfg.parameters(country_iso3)
    logger.info(f"Creating graph for {country_iso3}")
    main_dir = os.path.join(cfg.MAIN_OUTPUT_DIR, country_iso3)

    # Initialize graph with mobility edges.
    mobility_csv = os.path.join(
        main_dir,
        cfg.MOBILITY_DIR,
        cfg.MOBILITY_FILENAME.format(country_iso3=country_iso3),
    )
    G = initialize_with_mobility(mobility_csv)
    G.graph["country"] = country_iso3

    # Layer on the node attributes, one data source at a time.
    G = add_exposure(G, main_dir, country_iso3, params["admin"], cfg)
    G = add_covid(G, main_dir, country_iso3, end_date, cfg)
    G = add_WHO_data(G, country_iso3, end_date, params, cfg)
    G = add_vulnerability(G, main_dir, country_iso3, cfg)
    add_contact_matrix(G, params["contact_matrix"], cfg)

    admin_dir = params["admin"]["directory"]
    input_shp = os.path.join(
        cfg.INPUT_DIR,
        country_iso3,
        cfg.SHAPEFILE_DIR,
        admin_dir,
        f"{admin_dir}.shp",
    )
    # General attributes ensure compatibility with Bucky requirements.
    G = add_general_attributes(G, country_iso3, input_shp)

    # Write out: node-link JSON plus a pickle of the graph object.
    data = nx.readwrite.json_graph.node_link_data(G)
    outdir = os.path.join(main_dir, cfg.GRAPH_OUTPUT_DIR)
    Path(outdir).mkdir(parents=True, exist_ok=True)
    outfile_json = os.path.join(outdir, cfg.GRAPH_OUTPUT_FILE_JSON.format(country_iso3))
    outfile_pickle = os.path.join(outdir, cfg.GRAPH_OUTPUT_FILE_PICKLE.format(country_iso3))
    with open(outfile_json, "w") as f:
        json.dump(data, f, indent=2)
    logger.info(f"Wrote out to {outfile_json}")
    with open(outfile_pickle, "wb") as f:
        pickle.dump(G, f)
def npis(country_iso3, update_npi_list_arg, create_final_list_arg, download_acaps_arg=False, config=None):
    """Update and/or finalize the NPI (non-pharmaceutical intervention) list.

    :param country_iso3: ISO-3166 alpha-3 code of the country
    :param update_npi_list_arg: when truthy, refresh the working NPI list
    :param create_final_list_arg: when truthy, build the final NPI list
    :param download_acaps_arg: forwarded to update_npi_list (re-download ACAPS data)
    :param config: optional Config instance; a fresh one is created when None
    """
    cfg = config if config is not None else Config()
    country_params = cfg.parameters(country_iso3)
    if update_npi_list_arg:
        update_npi_list(cfg, country_params, country_iso3, download_acaps_arg)
    if create_final_list_arg:
        create_final_list(cfg, country_params, country_iso3)
def graph(country_iso3, mobility_csv, config=None):
    """Build the country graph from a mobility matrix and write it out as JSON.

    :param country_iso3: ISO-3166 alpha-3 code of the country
    :param mobility_csv: path to the mobility-matrix CSV used to seed the edges
    :param config: optional Config instance; a fresh one is created when None
    """
    if config is None:
        config = Config()
    # FIX: every other entry point in this codebase obtains country parameters
    # by CALLING config.parameters(...); subscripting a bound method raises
    # TypeError at runtime. (If `parameters` were a dict in this module's
    # Config, revert — but the sibling `graph`/`exposure`/`covid` all call it.)
    parameters = config.parameters(country_iso3)
    logger.info(f"Creating graph for {country_iso3}")
    main_dir = os.path.join(config.MAIN_OUTPUT_DIR, country_iso3)
    # Initialize graph with mobility edges
    G = initialize_with_mobility(mobility_csv)
    G.graph["country"] = country_iso3
    # Add exposure
    G = add_exposure(G, main_dir, country_iso3, parameters["admin"], config)
    # Add COVID cases
    G = add_covid(G, main_dir, country_iso3, config)
    # Add vulnerability
    G = add_vulnerability(G, main_dir, country_iso3, config)
    # Add contact matrix
    add_contact_matrix(G, parameters["contact_matrix"], config)
    # Write out as node-link JSON
    data = nx.readwrite.json_graph.node_link_data(G)
    outdir = os.path.join(main_dir, config.GRAPH_OUTPUT_DIR)
    Path(outdir).mkdir(parents=True, exist_ok=True)
    outfile = os.path.join(main_dir, config.GRAPH_OUTPUT_DIR,
                           config.GRAPH_OUTPUT_FILE.format(country_iso3))
    with open(outfile, "w") as f:
        json.dump(data, f, indent=2)
    logger.info(f"Wrote out to {outfile}")
def mobility(country_iso3, read_in_crossings=True, read_in_distances=True, config=None):
    """Build and save the admin-to-admin mobility matrix for a country.

    Combines road/border crossings and centroid distances into a mobility
    matrix (scaled by population), then writes the matrix CSV and a
    histogram figure to the country's mobility output directory.

    :param country_iso3: ISO-3166 alpha-3 code of the country
    :param read_in_crossings: reuse the cached crossings file instead of
        recomputing road/border intersections (the expensive step)
    :param read_in_distances: reuse the cached centroid-distances file
    :param config: optional Config instance; a fresh one is created when None
    """
    # Read in the files
    logger.info(f'Running for country {country_iso3}')
    # Get config and parameters
    if config is None:
        config = Config()
    parameters = config.parameters(country_iso3)
    # Make the output directory if it doesn't exist
    output_dir = os.path.join(config.MAIN_OUTPUT_DIR, country_iso3, config.MOBILITY_DIR)
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    # Load admin regions
    df_adm = load_adm(country_iso3, config, parameters)
    # Read in population file (exposure output GeoJSON)
    df_pop = gpd.read_file(exposure.get_output_filename(country_iso3, config))
    if read_in_crossings:
        logger.info('Reading in saved roads file')
        df_roads = gpd.read_file(os.path.join(output_dir, config.CROSSINGS_FILENAME))
        # The list-valued columns were stringified on save; parse them back.
        for cname in ['crossings', 'crossing_pairs']:
            df_roads[cname] = df_roads[cname].apply(ast.literal_eval)
    else:
        # Recompute crossings from scratch: borders -> roads -> intersections.
        df_borders = get_borders(df_adm)
        df_roads = load_roads(country_iso3, config, parameters['mobility'], df_borders)
        df_roads = get_road_crossings(df_roads, df_adm)
        # GPKG can't store Python lists, so stringify list columns in a copy
        # before saving (the in-memory df_roads keeps the real lists).
        df_roads_out = df_roads.copy()
        for cname in ['crossings', 'crossing_pairs']:
            df_roads_out[cname] = df_roads_out[cname].apply(str)
        df_roads_out.to_file(os.path.join(output_dir, config.CROSSINGS_FILENAME), driver='GPKG')
    # Get centroid dist
    if read_in_distances:
        logger.info('Reading in saved distances file')
        df_dist = pd.read_csv(os.path.join(output_dir, config.DISTANCES_FILENAME))
    else:
        df_dist = get_centroid_dist(df_adm)
        df_dist.to_csv(os.path.join(output_dir, config.DISTANCES_FILENAME), index=False)
    # Count the number of crossings
    df_dist = count_crossings(df_dist, df_roads, config)
    # Create matrix and plot
    df_matrix = create_matrix(df_adm, df_dist, parameters['mobility']['scaling_factor'], df_pop)
    fig = plot_final_hist(df_matrix, country_iso3)
    # Save matrix and plot
    df_matrix.to_csv(os.path.join(output_dir, config.MOBILITY_FILENAME.format(country_iso3=country_iso3)))
    fig.savefig(os.path.join(output_dir,
                             config.MOBILITY_FIGNAME.format(country_iso3=country_iso3)), format='png')
def test_get_worldpop_data(self, monkeypatch, tmp_path):
    """get_worldpop_data should request one raster per gender/age class plus
    the two total-population rasters, with the expected file names."""
    config = Config()
    config.WORLDPOP_DIR = "TestWorldPop"
    config.AGE_CLASSES = [10, 20, 30]
    input_dir = tmp_path / "TestInputDir"
    # input_dir.mkdir()
    # Record every (url, save_path) pair instead of hitting the FTP server.
    downloads = []

    def mock_download_ftp(url, save_path):
        downloads.append((url, save_path))

    monkeypatch.setattr(utils, "download_ftp", mock_download_ftp)
    get_worldpop_data("XYZ", os.fspath(input_dir), config)
    # "+ 2" covers the two country-total rasters downloaded on top of the
    # per-gender/age ones.
    assert (len(downloads) == len(config.AGE_CLASSES) * len(config.GENDER_CLASSES) + 2)
    # NOTE(review): the zip below assumes downloads arrive age-major with
    # f/m alternating within each age, matching this list comprehension —
    # verify against get_worldpop_data's iteration order; also note zip
    # silently ignores the two trailing total-population downloads.
    files = [f"xyz_{b}_{a}_2020.tif" for a in (10, 20, 30) for b in "fm"]
    for f, (url, save) in zip(files, downloads):
        assert url.endswith(f)
        assert save.endswith(f)
def exposure(country_iso3, download_worldpop=False, config=None):
    """Build the gender/age-disaggregated population (SADD) exposure file.

    Sums WorldPop rasters over each ADM2 polygon, rescales the SADD columns
    to match the UN-adjusted totals (and optionally Country Office figures),
    adds Kochi nomads where configured, and writes the result as GeoJSON.

    :param country_iso3: ISO-3166 alpha-3 code of the country
    :param download_worldpop: when True, (re-)download the WorldPop rasters
    :param config: optional Config instance; a fresh one is created when None
    """
    # Get parameters file
    if config is None:
        config = Config()
    parameters = config.parameters(country_iso3)
    input_dir = os.path.join(config.DIR_PATH, config.INPUT_DIR, country_iso3)
    # Get input boundary shape file
    ADM2boundaries = utils.read_in_admin_boundaries(config, parameters, country_iso3)
    # Download the worldpop data
    if download_worldpop:
        get_worldpop_data(country_iso3, input_dir, config)
    # gender and age groups
    gender_age_groups = list(
        itertools.product(config.GENDER_CLASSES, config.AGE_CLASSES))
    for gender_age_group in gender_age_groups:
        gender_age_group_name = f"{gender_age_group[0]}_{gender_age_group[1]}"
        logger.info(f"analyising gender age {gender_age_group_name}")
        input_tiff_file = os.path.join(
            input_dir,
            config.WORLDPOP_DIR,
            config.WORLDPOP_FILENAMES["sadd"].format(
                country_iso3=country_iso3.lower(),
                gender=gender_age_group[0],
                age=gender_age_group[1],
            ),
        )
        # Sum the raster cells falling inside each ADM2 polygon.
        raster = rasterio.open(input_tiff_file)
        ADM2boundaries[gender_age_group_name] = (
            ADM2boundaries['geometry'].apply(
                lambda x: mask(raster, [x], crop=True, nodata=0)[0].sum()))
    # get total pops (WorldPop raw and UN-adjusted rasters)
    for pop_type, cname in zip(["pop", "unadj"], ["tot_pop_WP", "tot_pop_UN"]):
        logger.info(f"adding {pop_type}")
        input_tiff_pop = os.path.join(
            input_dir,
            config.WORLDPOP_DIR,
            config.WORLDPOP_FILENAMES[pop_type].format(
                country_iso3=country_iso3.lower()),
        )
        raster = rasterio.open(input_tiff_pop)
        ADM2boundaries[cname] = (ADM2boundaries['geometry'].apply(
            lambda x: mask(raster, [x], crop=True, nodata=0)[0].sum()))
    # total from disaggregated: rescale SADD columns so their sum matches
    # the UN-adjusted total per ADM2 region.
    logger.info("scaling SADD data to match UN Adjusted population estimates")
    gender_age_group_names = [
        "{}_{}".format(gender_age_group[0], gender_age_group[1])
        for gender_age_group in gender_age_groups
    ]
    for index, row in ADM2boundaries.iterrows():
        tot_UN = row["tot_pop_UN"]
        tot_sad = row[gender_age_group_names].sum()
        try:
            ADM2boundaries.loc[index, gender_age_group_names] *= tot_UN / tot_sad
        except ZeroDivisionError:
            # NOTE(review): if tot_sad is a numpy float (typical for a
            # pandas row sum), dividing by zero yields inf/nan with a
            # warning instead of raising — verify this branch ever fires.
            region_name = row[f'ADM2_{parameters["admin"]["language"]}']
            logger.warning(
                f"The sum across all genders and ages for admin region {region_name} is 0"
            )
    if "pop_co" in parameters:
        # NOTE(review): print() here and below — inconsistent with the
        # logger used everywhere else in this function.
        print("Further scaling SADD data to match CO estimates")
        # scaling at the ADM1 level to match figures used by Country Office instead of UN stats
        input_pop_co_filename = os.path.join(input_dir, config.CO_DIR,
                                             parameters["pop_co"]["filename"])
        df_operational_figures = pd.read_excel(input_pop_co_filename, usecols="A,D")
        df_operational_figures["Province"] = df_operational_figures[
            "Province"].replace(parameters["pop_co"]["province_names"])
        # creating dictionary mapping ADM1 name -> pcode
        ADM1_names = dict()
        for k, v in ADM2boundaries.groupby("ADM1_EN"):
            ADM1_names[k] = v.iloc[0, :].ADM1_PCODE
        df_operational_figures["ADM1_PCODE"] = df_operational_figures[
            "Province"].map(ADM1_names)
        if df_operational_figures["ADM1_PCODE"].isnull().sum() > 0:
            print(
                "missing PCODE for: ",
                df_operational_figures[
                    df_operational_figures["ADM1_PCODE"].isnull()],
            )
        # get total by ADM1
        tot_co_adm1 = df_operational_figures.groupby(
            "ADM1_PCODE").sum()["Estimated Population - 2020"]
        tot_sad_adm1 = (ADM2boundaries.groupby("ADM1_PCODE")
                        [gender_age_group_names].sum().sum(axis=1))
        for index, row in ADM2boundaries.iterrows():
            adm1_pcode = row["ADM1_PCODE"]
            pop_co = tot_co_adm1.get(adm1_pcode)
            pop_sad = tot_sad_adm1.get(adm1_pcode)
            # NOTE(review): .get() returns None for a missing pcode, which
            # would make this multiplication raise — presumably pcodes are
            # always present by this point; confirm.
            ADM2boundaries.loc[index, gender_age_group_names] *= pop_co / pop_sad
    ADM2boundaries["tot_sad"] = ADM2boundaries.loc[:, gender_age_group_names].sum(
        axis=1)
    # adding manually Kochi nomads
    if "kochi" in parameters:
        logger.info("Adding Kochi")
        ADM1_kochi = parameters["kochi"]["adm1"]
        # total population in these provinces
        pop_in_kochi_ADM1 = ADM2boundaries[ADM2boundaries["ADM1_PCODE"].isin(
            ADM1_kochi)]["tot_sad"].sum()
        for row_index, row in ADM2boundaries.iterrows():
            if row["ADM1_PCODE"] in ADM1_kochi:
                tot_kochi_in_ADM2 = 0
                for gender_age_group in gender_age_groups:
                    # population weighted share of the configured Kochi total
                    gender_age_group_name = (
                        f"{gender_age_group[0]}_{gender_age_group[1]}")
                    kochi_pp = parameters["kochi"]["total"] * (
                        row[gender_age_group_name] / pop_in_kochi_ADM1)
                    ADM2boundaries.loc[row_index, gender_age_group_name] = (
                        row[gender_age_group_name] + kochi_pp)
                    tot_kochi_in_ADM2 += kochi_pp
                ADM2boundaries.loc[row_index, "kochi"] = tot_kochi_in_ADM2
                comment = f"Added in total {tot_kochi_in_ADM2} Kochi nomads to WorldPop estimates"
                ADM2boundaries.loc[row_index, "comment"] = comment
    # Write to file, stamped with provenance metadata
    ADM2boundaries["created_at"] = str(datetime.datetime.now())
    ADM2boundaries["created_by"] = getpass.getuser()
    output_geojson = get_output_filename(country_iso3, config)
    logger.info(f"Writing to file {output_geojson}")
    utils.write_to_geojson(output_geojson, ADM2boundaries)
def vulnerability(country_iso3, download_ghs=False, config=None):
    """Build the per-ADM2 vulnerability layer and write it out as GeoJSON.

    Computes the urban population fraction from GHS SMOD/POP rasters, adds
    food insecurity, then any of the optional risk factors present in the
    country parameters (solid fuels, handwashing, raised blood pressure,
    diabetes, smoking).

    :param country_iso3: ISO-3166 alpha-3 code of the country
    :param download_ghs: when True, (re-)download the GHS raster tiles
    :param config: optional Config instance; a fresh one is created when None
    """
    # Get config file
    if config is None:
        config = Config()
    # FIX: every other entry point in this codebase obtains country parameters
    # by CALLING config.parameters(...); subscripting a bound method raises
    # TypeError at runtime. (If `parameters` were a dict here, revert — but
    # the sibling exposure/covid/mobility functions all call it.)
    parameters = config.parameters(country_iso3)
    # Get input boundary shape file
    input_dir = os.path.join(config.DIR_PATH, config.INPUT_DIR, country_iso3)
    input_shp = os.path.join(
        input_dir,
        config.SHAPEFILE_DIR,
        parameters["admin"]["directory"],
        f'{parameters["admin"]["directory"]}.shp',
    )
    boundaries = gpd.read_file(input_shp).to_crs(config.GHS_CRS)
    # Download the tiles and read them in
    if download_ghs:
        get_ghs_data("SMOD", parameters["ghs"], country_iso3, input_dir, config)
        get_ghs_data("POP", parameters["ghs"], country_iso3, input_dir, config)
    ghs_smod = rasterio.open(
        os.path.join(
            input_dir,
            config.GHS_DIR,
            config.OUTPUT_GHS["SMOD"].format(country_iso3=country_iso3),
        ))
    ghs_pop = rasterio.open(
        os.path.join(
            input_dir,
            config.GHS_DIR,
            config.OUTPUT_GHS["POP"].format(country_iso3=country_iso3),
        ))
    # adding urban/rural disaggregation data using JRC GHSL input
    logger.info("Calculating urban population fraction")
    boundaries["frac_urban"] = boundaries["geometry"].apply(
        lambda x: calc_frac_urban(x, ghs_smod, ghs_pop, config))
    # Get food insecurity
    logger.info("Getting food insecurity")
    boundaries = add_food_insecurity(
        parameters["ipc"],
        input_dir,
        boundaries,
        parameters["admin"]["language"],
        config,
    )
    # Get solid fuels
    if "solid_fuels" in parameters:
        logger.info("Getting Solid Fuels data")
        boundaries = add_factor_urban_rural(boundaries, "fossil_fuels",
                                            parameters["solid_fuels"])
    else:
        logger.info(
            f"Solid fuels data not available for country {country_iso3}")
    # Get handwashing facilities
    if "handwashing_facilities" in parameters:
        logger.info("Getting Handwashing facilities data")
        boundaries = add_factor_urban_rural(
            boundaries, "handwashing_facilities",
            parameters["handwashing_facilities"])
    else:
        logger.info(
            f"Handwashing facilities data not available for country {country_iso3}"
        )
    # Get raised blood pressure
    if "raised_blood_pressure" in parameters:
        logger.info("Getting Raised Blood Pressure data")
        add_factor_18plus(
            boundaries,
            parameters["raised_blood_pressure"],
            "raised_blood_pressure",
            country_iso3,
            config,
        )
    else:
        logger.info(
            f"Raised blood pressure data not available for country {country_iso3}"
        )
    # Get diabetes (comment fixed: was a copy-paste of "raised blood pressure")
    if "diabetes" in parameters:
        logger.info("Getting diabetes data")
        add_factor_18plus(boundaries, parameters["diabetes"], "diabetes",
                          country_iso3, config)
    else:
        logger.info(f"Diabetes data not available for country {country_iso3}")
    # Get smoking
    if "smoking" in parameters:
        logger.info("Getting smoking data")
        add_factor_18plus(boundaries, parameters["smoking"], "smoking",
                          country_iso3, config)
    else:
        logger.info(f"Smoking data not available for country {country_iso3}")
    # Write out results (back in the standard shapefile CRS)
    output_dir = os.path.join(
        config.DIR_PATH, config.vulnerability_output_dir().format(country_iso3))
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    output_geojson = os.path.join(
        output_dir,
        config.VULNERABILITY_FILENAME.format(country_iso3=country_iso3))
    logger.info(f"Saving results to {output_geojson}")
    utils.write_to_geojson(output_geojson, boundaries.to_crs(config.SHP_CRS))
def covid(country_iso3, download_covid=False, config=None):
    """Build a standardized per-ADM2, per-date COVID cases/deaths CSV.

    Reads the raw country COVID file, normalizes it to HXL-tagged columns,
    maps admin names to pcodes via the exposure file, disaggregates ADM1
    figures to ADM2 by population where needed, estimates deaths from a
    WHO-derived national CFR when the source lacks them, optionally
    accumulates daily values into cumulative ones, and writes the result.

    :param country_iso3: ISO-3166 alpha-3 code of the country
    :param download_covid: when True, (re-)download the raw COVID data
    :param config: optional Config instance; a fresh one is created when None
    """
    # Get config file
    if config is None:
        config = Config()
    parameters = config.parameters(country_iso3)
    # Get input covid file
    input_dir = os.path.join(config.DIR_PATH, config.INPUT_DIR, country_iso3)
    # Download latest covid file tiles and read them in
    if download_covid:
        get_covid_data(parameters["covid"], country_iso3, input_dir, config)
    df_covid = pd.read_csv(
        "{}/{}".format(
            os.path.join(input_dir, config.COVID_OUTPUT_DIR),
            parameters["covid"]["filename"],
        ),
        header=parameters["covid"]["header"],
        skiprows=parameters["covid"]["skiprows"],
    )
    # drop duplicates. Some datasets have duplicated rows on HDX
    df_covid = df_covid.drop_duplicates()
    # convert to standard HXL column names
    if "hlx_dict" in parameters["covid"]:
        df_covid = df_covid.rename(columns=parameters["covid"]["hlx_dict"])
    # in South Sudan we have individual case data which need to be aggregated at the ADM2 level
    if (parameters["covid"]["individual_case_data"]
            and parameters["covid"]["admin_level"] == 2):
        # One row per case -> count rows per (date, ADM1, ADM2).
        df_covid = pd.pivot_table(
            df_covid,
            index=[
                config.HLX_TAG_DATE,
                config.HLX_TAG_ADM1_NAME,
                config.HLX_TAG_ADM2_NAME,
            ],
            aggfunc="count",
        ).reset_index()
        df_covid = df_covid.rename(
            columns={"Case No.": config.HLX_TAG_TOTAL_CASES})
        df_covid = df_covid[[
            config.HLX_TAG_DATE,
            config.HLX_TAG_ADM1_NAME,
            config.HLX_TAG_ADM2_NAME,
            config.HLX_TAG_TOTAL_CASES,
        ]]
    # convert to numeric
    if parameters["covid"]["cases"]:
        df_covid[config.HLX_TAG_TOTAL_CASES] = convert_to_numeric(
            df_covid[config.HLX_TAG_TOTAL_CASES])
    if parameters["covid"]["deaths"]:
        df_covid[config.HLX_TAG_TOTAL_DEATHS] = convert_to_numeric(
            df_covid[config.HLX_TAG_TOTAL_DEATHS])
    df_covid.fillna(0, inplace=True)
    # remove "Total" rows for spatially disaggregated data
    df_covid = df_covid[df_covid[config.HLX_TAG_ADM1_NAME] != "Total"]
    # cleaning up names before using the replace dictionary
    df_covid[config.HLX_TAG_ADM1_NAME] = df_covid[
        config.HLX_TAG_ADM1_NAME].str.replace("Province", "")
    df_covid[config.HLX_TAG_ADM1_NAME] = df_covid[
        config.HLX_TAG_ADM1_NAME].str.replace("State", "")
    df_covid[config.HLX_TAG_ADM1_NAME] = df_covid[
        config.HLX_TAG_ADM1_NAME].str.strip()
    # apply replace dict to match ADM unit names in the COD with the COVID data
    if ("replace_dict" in parameters["covid"]
            and parameters["covid"]["admin_level"] == 1):
        df_covid[config.HLX_TAG_ADM1_NAME] = df_covid[
            config.HLX_TAG_ADM1_NAME].replace(
                parameters["covid"]["replace_dict"])
        # Some datasets have multiple rows corresponding to the same ADM1
        df_covid = df_covid.groupby(
            [config.HLX_TAG_ADM1_NAME, config.HLX_TAG_DATE]).sum().reset_index()
    if ("replace_dict" in parameters["covid"]
            and parameters["covid"]["admin_level"] == 2):
        df_covid[config.HLX_TAG_ADM2_NAME] = df_covid[
            config.HLX_TAG_ADM2_NAME].replace(
                parameters["covid"]["replace_dict"])
        # Some datasets have multiple rows corresponding to the same ADM2
        df_covid = df_covid.groupby(
            [config.HLX_TAG_ADM2_NAME, config.HLX_TAG_DATE]).sum().reset_index()
    # Get exposure file (needed for the name -> pcode maps and pop fractions)
    try:
        exposure_file = f"{config.SADD_output_dir().format(country_iso3)}/{config.EXPOSURE_GEOJSON.format(country_iso3)}"
        exposure_gdf = gpd.read_file(exposure_file)
    except Exception as err:
        logger.error(
            f"Cannot get exposure file for {country_iso3}, COVID file not generate"
        )
        raise err
    output_fields = [
        config.HLX_TAG_ADM1_PCODE,
        config.HLX_TAG_ADM2_PCODE,
        config.HLX_TAG_DATE,
        config.HLX_TAG_TOTAL_CASES,
        config.HLX_TAG_TOTAL_DEATHS,
    ]
    output_df_covid = pd.DataFrame(columns=output_fields)
    ADM2_ADM1_pcodes = get_dict_pcodes(exposure_gdf, "ADM2_PCODE")
    ADM0_CFR = 0
    if not parameters["covid"]["deaths"]:
        # missing death data, getting it from WHO at the national level
        who_df = get_WHO_data(config, country_iso3, hxlize=False,
                              smooth_data=parameters['WHO']['smooth_data'],
                              n_days_smoothing=parameters['WHO']['n_days_smoothing'])
        who_df['Date_reported'] = pd.to_datetime(who_df['Date_reported'])
        who_df = who_df.sort_values(by='Date_reported')
        who_df = who_df.set_index('Date_reported')
        latest_date = who_df.tail(1).index.values[0]
        # get the CFR from the latest month, to account for recent reporting rate estimation
        who_df = who_df.loc[latest_date - np.timedelta64(30, 'D'):latest_date]
        deaths = who_df.iloc[-1]['Cumulative_deaths'] - who_df.iloc[0][
            'Cumulative_deaths']
        cases = who_df.iloc[-1]['Cumulative_cases'] - who_df.iloc[0][
            'Cumulative_cases']
        ADM0_CFR = deaths / cases
        if deaths < 100 or ADM0_CFR > 0.3:
            # if deaths are below 100, use the cumulative totals to reduce noise.
            # When there are adjustments to the data we may have a jump in the
            # CFR calculated from the latest month; that case is captured by
            # the ADM0_CFR > 0.3 condition.
            deaths = who_df.iloc[-1]['Cumulative_deaths']
            cases = who_df.iloc[-1]['Cumulative_cases']
            ADM0_CFR = deaths / cases
    if parameters["covid"]["admin_level"] == 2:
        # Data is already at ADM2: just map names to pcodes.
        ADM2_names = get_dict_pcodes(exposure_gdf,
                                     parameters["covid"]["adm2_name_exp"],
                                     "ADM2_PCODE")
        df_covid[config.HLX_TAG_ADM2_PCODE] = df_covid[
            config.HLX_TAG_ADM2_NAME].map(ADM2_names)
        if df_covid[config.HLX_TAG_ADM2_PCODE].isnull().sum() > 0:
            logger.warning("missing PCODE for the following admin units ")
            logger.warning(
                df_covid[df_covid[config.HLX_TAG_ADM2_PCODE].isnull()][
                    config.HLX_TAG_ADM2_NAME].values)
            # print(df_covid)
            # Abort without writing output when any pcode is unmapped.
            return
        df_covid[config.HLX_TAG_ADM1_PCODE] = df_covid[
            config.HLX_TAG_ADM2_PCODE].map(ADM2_ADM1_pcodes)
        adm1pcode = df_covid[config.HLX_TAG_ADM1_PCODE]
        adm2pcodes = df_covid[config.HLX_TAG_ADM2_PCODE]
        date = pd.to_datetime(df_covid[config.HLX_TAG_DATE],
                              format=parameters["covid"]["date_format"])
        date = date.dt.strftime("%Y-%m-%d")
        adm2cases = (df_covid[config.HLX_TAG_TOTAL_CASES]
                     if parameters["covid"]["cases"] else None)
        adm2deaths = (df_covid[config.HLX_TAG_TOTAL_DEATHS]
                      if parameters["covid"]["deaths"] else None)
        if not parameters["covid"]["deaths"]:
            # Estimate deaths from cases via the national CFR computed above.
            adm2deaths = [cases * ADM0_CFR for cases in adm2cases]
        raw_data = {
            config.HLX_TAG_ADM1_PCODE: adm1pcode,
            config.HLX_TAG_ADM2_PCODE: adm2pcodes,
            config.HLX_TAG_DATE: date,
            config.HLX_TAG_TOTAL_CASES: adm2cases,
            config.HLX_TAG_TOTAL_DEATHS: adm2deaths,
        }
        # NOTE(review): DataFrame.append was deprecated in pandas 1.4 and
        # removed in 2.0 — migrate to pd.concat when upgrading pandas.
        output_df_covid = output_df_covid.append(pd.DataFrame(raw_data),
                                                 ignore_index=True)
    elif parameters["covid"]["admin_level"] == 1:
        if parameters["covid"].get("federal_state_dict", False):
            # for Somalia we replace the ADM1_PCODE with the name of the ADM1
            # and with the name of the state, according to the dictionary
            exposure_gdf["ADM1_PCODE"] = exposure_gdf[
                parameters["covid"]["adm1_name_exp"]].replace(
                    parameters["covid"]["federal_state_dict"])
            exposure_gdf[parameters["covid"]
                         ["adm1_name_exp"]] = exposure_gdf["ADM1_PCODE"]
        # get dictionary of ADM1 pcodes
        ADM1_names = get_dict_pcodes(exposure_gdf,
                                     parameters["covid"]["adm1_name_exp"],
                                     "ADM1_PCODE")
        # create new column with pcodes
        df_covid[config.HLX_TAG_ADM1_PCODE] = df_covid[
            config.HLX_TAG_ADM1_NAME].map(ADM1_names)
        # check if any pcode is missing (warn only; processing continues)
        if df_covid[config.HLX_TAG_ADM1_PCODE].isnull().sum() > 0:
            logger.warning("missing PCODE for the following admin units :")
            logger.warning(
                df_covid[df_covid[config.HLX_TAG_ADM1_PCODE].isnull()][
                    config.HLX_TAG_ADM1_NAME].values)
        # get the full list of gender/age combinations to calculate the sum
        # of population in adm2_pop_fractions; in principle we could use the
        # sum in the exposure but it's safer to recalculate it
        gender_age_groups = list(
            itertools.product(config.GENDER_CLASSES, config.AGE_CLASSES))
        gender_age_group_names = [
            "{}_{}".format(gender_age_group[0], gender_age_group[1])
            for gender_age_group in gender_age_groups
        ]
        # Disaggregate each ADM1 row to its ADM2s by population fraction.
        for _, row in df_covid.iterrows():
            adm2_pop_fractions = get_adm2_to_adm1_pop_frac(
                row[config.HLX_TAG_ADM1_PCODE], exposure_gdf,
                gender_age_group_names)
            adm1pcode = row[config.HLX_TAG_ADM1_PCODE]
            date = datetime.datetime.strptime(
                row[config.HLX_TAG_DATE],
                parameters["covid"]["date_format"]).strftime("%Y-%m-%d")
            adm2cases = scale_adm1_by_adm2_pop(
                parameters["covid"]["cases"],
                config.HLX_TAG_TOTAL_CASES,
                row,
                adm2_pop_fractions,
            )
            adm2deaths = scale_adm1_by_adm2_pop(
                parameters["covid"]["deaths"],
                config.HLX_TAG_TOTAL_DEATHS,
                row,
                adm2_pop_fractions,
            )
            if not parameters["covid"]["deaths"]:
                # Estimate deaths from cases via the national CFR.
                adm2deaths = [cases * ADM0_CFR for cases in adm2cases]
            adm2pcodes = [v for v in adm2_pop_fractions.keys()]
            raw_data = {
                config.HLX_TAG_ADM1_PCODE: adm1pcode,
                config.HLX_TAG_ADM2_PCODE: adm2pcodes,
                config.HLX_TAG_DATE: date,
                config.HLX_TAG_TOTAL_CASES: adm2cases,
                config.HLX_TAG_TOTAL_DEATHS: adm2deaths,
            }
            # NOTE(review): see pandas append note above — removed in 2.0.
            output_df_covid = output_df_covid.append(pd.DataFrame(raw_data),
                                                     ignore_index=True)
    else:
        logger.error(f"Missing admin_level info for COVID data")
    # cross-check: the total must match (tolerance of 10 cases)
    if (abs((output_df_covid[config.HLX_TAG_TOTAL_CASES].sum() -
             df_covid[config.HLX_TAG_TOTAL_CASES].sum())) > 10):
        logger.warning("The sum of input and output files don't match")
    if not parameters["covid"]["cumulative"]:
        logger.info(f"Calculating cumulative numbers COVID data")
        groups = [
            config.HLX_TAG_ADM1_PCODE,
            config.HLX_TAG_ADM2_PCODE,
            config.HLX_TAG_DATE,
        ]
        # TODO check this was not numeric in the case of SSD
        output_df_covid[config.HLX_TAG_TOTAL_CASES] = pd.to_numeric(
            output_df_covid[config.HLX_TAG_TOTAL_CASES])
        output_df_covid[config.HLX_TAG_TOTAL_DEATHS] = pd.to_numeric(
            output_df_covid[config.HLX_TAG_TOTAL_DEATHS])
        # get sum by day (in case multiple reports per day)
        output_df_covid = (output_df_covid.groupby(groups).sum().sort_values(
            by=config.HLX_TAG_DATE))
        # get cumsum by day (grouped by ADM2)
        output_df_covid = (output_df_covid.groupby(
            config.HLX_TAG_ADM2_PCODE).cumsum().reset_index())
    if parameters["covid"].get("federal_state_dict", False):
        # bring back the adm1 pcode that we modified to calculate the sum
        output_df_covid[config.HLX_TAG_ADM1_PCODE] = output_df_covid[
            config.HLX_TAG_ADM2_PCODE].map(ADM2_ADM1_pcodes)
    # Write to file, stamped with provenance metadata
    output_df_covid["created_at"] = str(datetime.datetime.now())
    output_df_covid["created_by"] = getpass.getuser()
    output_csv = get_output_filename(country_iso3, config)
    logger.info(f"Writing to file {output_csv}")
    output_df_covid.to_csv(output_csv, index=False)