def main():
    config_filename = Path(__file__).parent / "data_processing_config.yaml"
    uri = "data_processing_uri"
    git_sha = "data_processing_git_sha"
    with DataProcessingAPI.from_config(config_filename, uri=uri,
                                       git_sha=git_sha) as api:
        # First get the CoMix contact matrix
        comix_external = "https://cmmid.github.io/topics/covid19/reports/20200327_comix_social_contacts.xlsx"
        with api.read_external_object(comix_external, 'only') as file:
            df_comix = pd.read_excel(file, sheet_name="All_contacts_imputed")
            df_comix = df_comix.set_index("Unnamed: 0")

        # Then get the population - ONS (England and Wales), and NRS (Scotland)
        ons_external = "wales_england_pop.csv"
        with api.read_external_object(ons_external, 'only') as file:
            ons_pop = pd.read_csv(file, index_col="AGE").POPULATION

        nrs_internal = "human/demographics/population/scotland/1.0.0.h5"
        # read_array returns the array itself plus the dimension labels
        data_plus_lists = api.read_array(nrs_internal, "health_board/age/persons")
        nrs_array = data_plus_lists[0]
        placeNames = list(data_plus_lists[1][0][1])
        ageNames = list(data_plus_lists[1][1][1])
        ages = [int(s.replace("AGE", "").replace("+", "")) for s in ageNames]
        df = pd.DataFrame(nrs_array, index=ages, columns=placeNames).T
        nrs_pop = df.sum()

        data = Data(comix=df_comix, population=ons_pop + nrs_pop)
        contacts = comix_to_contacts(
            data.comix, _aggregate_pop_full_comix(data.population, data.comix))
        contacts = split_17_years_old(contacts, data.population)
        contacts = collapse_columns(contacts, ["[0,5)", "[5,17)"], "[0,17)")
        contacts = collapse_columns(
            contacts,
            ["17", "[18,30)", "[30,40)", "[40,50)", "[50,60)", "[60,70)"],
            "[17,70)",
        )
        # The 70+ entry is already what we want
        comix = contacts_to_comix(
            contacts, _aggregate_pop_simplified_comix(data.population, contacts))
        flattened = _flatten(comix)
        flattened.to_csv("mixing-matrix.csv", index=False)
        api.write_table(
            "generated_sns_products/simplified_comix_matrix",
            "simplified_comix_matrix",
            flattened,
        )
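# A minimal sketch of what collapse_columns is assumed to do in main() above:
# sum the named age-band rows and columns of a square contact matrix into a
# single new band. Illustrative only - the real helper in this repository may
# differ in signature and edge-case handling.
def collapse_columns_sketch(contacts, bands, new_band):
    """Sum the rows and columns of `contacts` named in `bands` into one `new_band`."""
    collapsed = contacts.copy()
    # Sum the selected columns into the new band, then drop the originals
    collapsed[new_band] = collapsed[bands].sum(axis=1)
    collapsed = collapsed.drop(columns=bands)
    # Do the same for the corresponding rows
    collapsed.loc[new_band] = collapsed.loc[bands].sum(axis=0)
    return collapsed.drop(index=bands)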
def main():
    dfPop, totalPop = download_pop_table()
    dfLookup, codes_of_interest = download_lookup_table()

    # We only want the ISO code and the population weighting, so add a column
    # for the weighting and remove the extraneous columns
    dfLookup = dfLookup.join(dfPop.set_index("la_code"), on="la_code")
    dfLookup["pop_weighting"] = dfLookup["population"] / totalPop
    dfLookup = dfLookup[["full_iso_code", "pop_weighting"]]
    dfLookup.set_index("full_iso_code", inplace=True)

    dfScotGoogle = download_google_mobility_data(codes_of_interest)
    # Average the three mobility categories we care about
    dfScotGoogle["movements_for_decrease"] = (
        dfScotGoogle["transit_stations_percent_change_from_baseline"]
        + dfScotGoogle["workplaces_percent_change_from_baseline"]
        + dfScotGoogle["retail_and_recreation_percent_change_from_baseline"]) / 3
    dfScotGoogle = dfScotGoogle[[
        "iso_3166_2_code", "date", "movements_for_decrease"
    ]]
    dfScotGoogle = dfScotGoogle.merge(dfLookup,
                                      left_on="iso_3166_2_code",
                                      right_index=True,
                                      how="left")
    # Population-weighted contribution of each region, summed per day
    dfScotGoogle["weighted_moves"] = (dfScotGoogle["movements_for_decrease"] *
                                      dfScotGoogle["pop_weighting"])
    dfScotGoogle = dfScotGoogle[["date", "weighted_moves"]]
    dfScotGoogle["date"] = pd.to_datetime(dfScotGoogle["date"])
    dfScotGoogle = dfScotGoogle.groupby("date").sum()
    # Convert the percent change from baseline into a movement multiplier
    dfScotGoogle["weighted_moves"] = 1.0 + dfScotGoogle["weighted_moves"] / 100.0
    # Keep weekdays only
    dfScotGoogle = dfScotGoogle[dfScotGoogle.index.dayofweek < 5]

    # Now upload this to the database: human/movement-multipliers/1/data.h5
    movement_multiplier_table = "movement_multiplier.csv"
    dfScotGoogle.to_csv(movement_multiplier_table)
    with DataProcessingAPI.from_config(config_filename, uri=uri,
                                       git_sha=git_sha) as api:
        api.write_table(
            "generated_sns_products/movement_multiplier",
            "movement_multiplier",
            dfScotGoogle,
        )
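# A small worked example of the multiplier arithmetic above, with made-up
# numbers (not real Google mobility data): a population-weighted average
# percent change of -37 becomes a movement multiplier of 0.63.
import pandas as pd

example = pd.DataFrame({
    "movements_for_decrease": [-40.0, -10.0],  # hypothetical daily averages
    "pop_weighting": [0.9, 0.1],               # hypothetical region weights
})
weighted = (example["movements_for_decrease"] * example["pop_weighting"]).sum()
multiplier = 1.0 + weighted / 100.0  # 1.0 + (-37.0) / 100.0 == 0.63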
def download_google_mobility_data(la_list):
    """
    Downloads the mobility data from Google for Scottish local authorities if
    it doesn't exist in data_path.

    :param la_list: The list of local authorities for which we want the movement data
    :return: A dataframe of the Google mobility data restricted to the local
        authorities in la_list
    """
    google_mobility_table = "Global_Mobility_Report.csv"
    with DataProcessingAPI.from_config(config_filename, uri=uri,
                                       git_sha=git_sha) as api:
        with api.read_external_object(google_mobility_table, "only") as file:
            justScotGoogle = pd.read_csv(file, low_memory=False)
    justScotGoogle = justScotGoogle[
        justScotGoogle["iso_3166_2_code"].isin(la_list)]
    return justScotGoogle
def download_lookup_table():
    """
    Downloads the mapping of iso_3166_2 codes to local authority from the
    SCRC database; if it doesn't exist there, upload it.

    :return: A dataframe containing the full ISO code (GB-iso_3166_2) and the
        corresponding local authority code, plus the list of full ISO codes
        of interest
    """
    # ISO region to LA best-attempt lookup table: compiled by hand,
    # Jess Enright, 30 June 2020
    lookup_table = "iso-3166-2_to_scottishLA.csv"
    with DataProcessingAPI.from_config(config_filename, uri=uri,
                                       git_sha=git_sha) as api:
        with api.read_external_object(lookup_table, "only") as file:
            dfLookup = pd.read_csv(file, low_memory=False)
    dfLookup["full_iso_code"] = "GB-" + dfLookup.iso_3166_2
    codes_of_interest = list(dfLookup["full_iso_code"])
    dfLookup = dfLookup[["full_iso_code", "la_code"]]
    return dfLookup, codes_of_interest
def test_read_external_object_no_component(tmp_path):
    with open(tmp_path / "config.yaml", "w") as config_file:
        config_file.write("""
data_directory: .
access_log: False
fail_on_hash_mismatch: False
""")
    with open(tmp_path / "metadata.yaml", "w") as metadata_file:
        metadata_file.write("""
- doi_or_unique_name: doi
  title: title
  filename: data.txt
""")
    with open(tmp_path / "data.txt", "w") as data_file:
        data_file.write("hello world")
    with DataProcessingAPI.from_config(tmp_path / "config.yaml", "uri",
                                       "sha") as api:
        with api.read_external_object("doi", "title") as file:
            assert file.read().decode() == "hello world"
def main():
    config_filename = Path(__file__).parent / "data_processing_config.yaml"
    uri = "data_processing_uri"
    git_sha = "data_processing_git_sha"
    with DataProcessingAPI.from_config(config_filename, uri=uri,
                                       git_sha=git_sha) as api:
        nrs_internal = "human/demographics/population/scotland/1.0.0.h5"
        # read_array returns the array itself plus the dimension labels;
        # the leading axis of this array is gender (female, male)
        data_plus_lists = api.read_array(nrs_internal, "health_board/age/genders")
        female_data = data_plus_lists[0][0]
        male_data = data_plus_lists[0][1]
        placeNames = list(data_plus_lists[1][0][1])
        ageNames = list(data_plus_lists[1][1][1])
        ages = [int(s.replace("AGE", "").replace("+", "")) for s in ageNames]
        female_pop = pd.DataFrame(female_data, index=ages, columns=placeNames).T
        male_pop = pd.DataFrame(male_data, index=ages, columns=placeNames).T

        # Aggregate single years of age into the three age classes used downstream
        age_class_dict = {
            "[0,17)": range(0, 17),
            "[17,70)": range(17, 70),
            "70+": range(70, 91),
        }
        aggFemale = aggregate_columns_and_rename(female_pop, age_class_dict,
                                                 "Female")
        aggMale = aggregate_columns_and_rename(male_pop, age_class_dict, "Male")
        aggTogether = pd.concat([aggFemale, aggMale])
        aggTogether = aggTogether.set_index(["Health_Board", "Sex"]).stack()
        aggTogether = aggTogether.reset_index()
        aggTogether.columns = ["Health_Board", "Sex", "Age", "Total"]
        aggTogether.to_csv("check_pop_table.csv", index=False)
        api.write_table(
            "generated_sns_products/population_healthboards_scotland",
            "population_healthboards_scotland",
            aggTogether,
        )
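# A minimal sketch of what aggregate_columns_and_rename is assumed to do in
# main() above: sum the integer age columns of `pop` into the age classes
# given in `age_class_dict`, label every row with `sex`, and expose the index
# as a Health_Board column. Illustrative only - the real helper may differ.
import pandas as pd

def aggregate_columns_and_rename_sketch(pop, age_class_dict, sex):
    agg = pd.DataFrame(index=pop.index)
    for age_class, age_range in age_class_dict.items():
        # Only sum the ages actually present as columns
        cols = [a for a in age_range if a in pop.columns]
        agg[age_class] = pop[cols].sum(axis=1)
    agg["Sex"] = sex
    return agg.reset_index().rename(columns={"index": "Health_Board"})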
def download_pop_table():
    """
    Download the population data from an external source if it doesn't exist
    in data_path, using only the Area1 data and the first 3 columns: the LA
    code, name and population, removing the commas in the population numbers.

    :return: A dataframe containing the local authority code, name and
        population, and the total population
    """
    population_table = "mid-year-pop-est-18-tabs_Table 2.csv"

    # The downloading below isn't currently in use - we want these scripts not
    # to download anything directly, but instead to handle that as part of the
    # database. However, I've left it here as a record, in case we need to
    # include it again.
    #
    # If the population table doesn't exist, download it.
    # if not Path(data_path / population_table).exists():
    #     print(f"Could not find {data_path}/{population_table}, downloading it")
    #     url = "https://www.nrscotland.gov.uk/files//statistics/population-estimates/mid-18/mid-year-pop-est-18-tabs.zip"
    #     zip_filename = "mid-year-pop-est-18-tabs.zip"
    #     urllib.request.urlretrieve(url, zip_filename)
    #
    #     with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    #         zip_ref.extractall(data_path, members=[population_table])
    #
    #     # clean up (i.e. remove) the downloaded datafile(s)
    #     Path(zip_filename).unlink(missing_ok=False)

    with DataProcessingAPI.from_config(config_filename, uri=uri,
                                       git_sha=git_sha) as api:
        with api.read_external_object(population_table, "only") as file:
            dfPop = pd.read_csv(file, skiprows=5, nrows=32, usecols=[0, 1, 2])
    dfPop.columns = ["la_code", "la_name", "population"]
    dfPop["population"] = dfPop["population"].str.replace(",", "").astype(int)
    total_population = dfPop["population"].sum()
    return dfPop, total_population
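# A self-contained illustration of the parsing step above, on a made-up CSV
# fragment with the same shape (illustrative data, not the NRS table):
import io
import pandas as pd

toy_csv = (
    "header 1\nheader 2\nheader 3\nheader 4\nheader 5\n"
    "code,name,pop,extra\n"
    'S12000033,Example City,"227,560",x\n'
)
df = pd.read_csv(io.StringIO(toy_csv), skiprows=5, nrows=32, usecols=[0, 1, 2])
df.columns = ["la_code", "la_name", "population"]
# Strip the thousands separators so the column can be summed as integers
df["population"] = df["population"].str.replace(",", "").astype(int)
assert df["population"].sum() == 227560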
"""
How to run this module:

This script can be run with

    python generating_rates_from_epidemiological_params.py

The script generates a .h5 file of compartment transition rates, in the
location specified in data_processing_config.yaml
"""
from pathlib import Path

import pandas as pd

from data_pipeline_api.data_processing_api import DataProcessingAPI

config_filename = Path(__file__).parent / "data_processing_config.yaml"
uri = "data_processing_uri"
git_sha = "data_processing_git_sha"

with DataProcessingAPI.from_config(config_filename, uri=uri,
                                   git_sha=git_sha) as api:
    # States: S E A A_2 I H R D
    # We need to generate rates for the transitions:
    #   E   -> E
    #   E   -> A_2
    #   A_2 -> A_2
    #   A_2 -> R
    #   E   -> A
    #   A   -> A
    #   A   -> I
    #   I   -> I
    #   I   -> D
    #   I   -> H
    #   I   -> R
    #   H   -> H
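# What follows is NOT the script's actual computation (that part is omitted
# above); it is a minimal illustrative sketch of one common way such rates
# are built for a discrete-time model: a compartment with an assumed mean
# sojourn time of D timesteps gets a "stay" rate of exp(-1/D), and the
# complementary exit mass is split between outcomes by assumed probabilities.
# All numbers and names here are hypothetical.
import math

mean_time_in_E = 4.0   # hypothetical mean latent period, in timesteps
prob_E_to_A2 = 0.4     # hypothetical share of E exits that go to A_2

rate_E_E = math.exp(-1.0 / mean_time_in_E)          # E -> E (stay)
rate_E_A2 = (1.0 - rate_E_E) * prob_E_to_A2         # E -> A_2
rate_E_A = (1.0 - rate_E_E) * (1.0 - prob_E_to_A2)  # E -> A
assert abs(rate_E_E + rate_E_A2 + rate_E_A - 1.0) < 1e-12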
def main():
    config_filename = Path(__file__).parent / "data_processing_config.yaml"
    uri = "data_processing_uri"
    git_sha = "data_processing_git_sha"
    with DataProcessingAPI.from_config(config_filename, uri=uri,
                                       git_sha=git_sha) as api:
        # The lookup table file below is available externally from
        # https://www2.gov.scot/Resource/0046/00462936.csv
        upward = "00462936.csv"
        # The commutes file below is available from http://wicid.ukdataservice.ac.uk/
        # to academics and local government after making an account and
        # agreeing to a EULA for access to safeguarded data flows
        flows = "wu03buk_oa_wz_v4.csv"
        with api.read_external_object(upward, 'only') as file:
            dfUp = pd.read_csv(file)
        dfUp = dfUp[["OutputArea", "DataZone", "InterZone"]]
        dfUp = dfUp.set_index("OutputArea")
        with api.read_external_object(flows, 'only') as file:
            dfMoves = pd.read_csv(
                file,
                names=[
                    "sourceOA",
                    "destOA",
                    "total",
                    "breakdown1",
                    "breakdown2",
                    "breakdown3",
                ],
            )
        withSourceDZ = dfMoves.merge(dfUp, how="inner", left_on="sourceOA",
                                     right_index=True)
        withBothDZ = withSourceDZ.merge(dfUp, how="inner", left_on="destOA",
                                        right_index=True)
        withBothIZ = withBothDZ[["InterZone_x", "InterZone_y", "total"]]
        withBothIZ.columns = ["source_IZ", "dest_IZ", "weight"]
        withBothIZ = withBothIZ.groupby(["source_IZ", "dest_IZ"]).sum()
        withBothIZ = withBothIZ.reset_index()
        withBothDZ = withBothDZ[["DataZone_x", "DataZone_y", "total"]]
        withBothDZ.columns = ["source_DZ", "dest_DZ", "weight"]
        withBothDZ = withBothDZ.groupby(["source_DZ", "dest_DZ"]).sum()
        withBothDZ = withBothDZ.reset_index()
        api.write_table(
            "generated_sns_products/wu03buk_oa_wz_v4_scottish_datazones",
            "wu03buk_oa_wz_v4_scottish_datazones",
            withBothDZ,
        )
        api.write_table(
            "generated_sns_products/wu03buk_oa_wz_v4_scottish_interzones",
            "wu03buk_oa_wz_v4_scottish_interzones",
            withBothIZ,
        )
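# A self-contained toy run of the merge-and-aggregate pattern used in main()
# above, with made-up output areas and flows (illustrative only): output-area
# commute counts are joined to their data zones on both ends, then summed.
import pandas as pd

dfUp = pd.DataFrame(
    {"DataZone": ["DZ1", "DZ1", "DZ2"], "InterZone": ["IZ1", "IZ1", "IZ1"]},
    index=pd.Index(["OA1", "OA2", "OA3"], name="OutputArea"),
)
dfMoves = pd.DataFrame({
    "sourceOA": ["OA1", "OA2", "OA3"],
    "destOA": ["OA3", "OA3", "OA1"],
    "total": [10, 5, 2],
})
withSourceDZ = dfMoves.merge(dfUp, how="inner", left_on="sourceOA", right_index=True)
withBothDZ = withSourceDZ.merge(dfUp, how="inner", left_on="destOA", right_index=True)
flows_DZ = (withBothDZ[["DataZone_x", "DataZone_y", "total"]]
            .groupby(["DataZone_x", "DataZone_y"]).sum().reset_index())
# flows_DZ now holds DZ1 -> DZ2 with weight 15 and DZ2 -> DZ1 with weight 2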