def main():
    config_filename = Path(__file__).parent / "data_processing_config.yaml"
    uri = "data_processing_uri"
    git_sha = "data_processing_git_sha"
    with DataProcessingAPI.from_config(config_filename,
                                       uri=uri,
                                       git_sha=git_sha) as api:

        # First, get the CoMix social contact matrix
        comix_external = "https://cmmid.github.io/topics/covid19/reports/20200327_comix_social_contacts.xlsx"
        with api.read_external_object(comix_external, 'only') as file:
            df_comix = pd.read_excel(file, sheet_name="All_contacts_imputed")
        df_comix = df_comix.set_index("Unnamed: 0")
        # Then get the population data - ONS (England & Wales) and NRS (Scotland)
        ons_external = "wales_england_pop.csv"
        with api.read_external_object(ons_external, 'only') as file:
            ons_pop = pd.read_csv(file, index_col="AGE").POPULATION

        nrs_internal = "human/demographics/population/scotland/1.0.0.h5"
        data_plus_lists = api.read_array(nrs_internal,
                                         "health_board/age/persons")
        # read_array appears to return (data, dimension_metadata), where each
        # metadata entry looks like a (dimension_title, labels) pair
        data = data_plus_lists[0]
        placeNames = list(data_plus_lists[1][0][1])
        ageNames = list(data_plus_lists[1][1][1])
        # print(placeNames)

        ages = [int(s.replace("AGE", "").replace("+", "")) for s in ageNames]
        df = pd.DataFrame(data, index=ages, columns=placeNames).T

        # Total Scottish population per single year of age (summed over health boards)
        nrs_pop = df.sum()

        data = Data(comix=df_comix, population=ons_pop + nrs_pop)
        contacts = comix_to_contacts(
            data.comix, _aggregate_pop_full_comix(data.population, data.comix))

        contacts = split_17_years_old(contacts, data.population)

        contacts = collapse_columns(contacts, ["[0,5)", "[5,17)"], "[0,17)")
        contacts = collapse_columns(
            contacts,
            ["17", "[18,30)", "[30,40)", "[40,50)", "[50,60)", "[60,70)"],
            "[17,70)",
        )
        # The 70+ entry is already what we want

        comix = contacts_to_comix(
            contacts, _aggregate_pop_simplified_comix(data.population,
                                                      contacts))

        flattened = _flatten(comix)
        flattened.to_csv("mixing-matrix.csv", index=False)
        api.write_table(
            "generated_sns_products/simplified_comix_matrix",
            "simplified_comix_matrix",
            flattened,
        )
Example #2
def main():

    dfPop, totalPop = download_pop_table()
    dfLookup, codes_of_interest = download_lookup_table()

    # We only want the ISO code and the population weighting, so add a column for the weighting and drop the extraneous columns
    dfLookup = dfLookup.join(dfPop.set_index("la_code"), on="la_code")
    dfLookup["pop_weighting"] = dfLookup["population"] / totalPop

    dfLookup = dfLookup[["full_iso_code", "pop_weighting"]]
    dfLookup.set_index("full_iso_code", inplace=True)

    dfScotGoogle = download_google_mogility_data(codes_of_interest)

    dfScotGoogle["movements_for_decrease"] = (
        dfScotGoogle["transit_stations_percent_change_from_baseline"] +
        dfScotGoogle["workplaces_percent_change_from_baseline"] +
        dfScotGoogle["retail_and_recreation_percent_change_from_baseline"]) / 3

    dfScotGoogle = dfScotGoogle[[
        "iso_3166_2_code", "date", "movements_for_decrease"
    ]]

    dfScotGoogle = dfScotGoogle.merge(dfLookup,
                                      left_on="iso_3166_2_code",
                                      right_index=True,
                                      how="left")
    dfScotGoogle["weighted_moves"] = (dfScotGoogle["movements_for_decrease"] *
                                      dfScotGoogle["pop_weighting"])
    dfScotGoogle = dfScotGoogle[["date", "weighted_moves"]]
    dfScotGoogle["date"] = pd.to_datetime(dfScotGoogle["date"])

    # Sum the population-weighted movement changes per date, then convert the
    # percentage change from baseline into a multiplier around 1.0
    dfScotGoogle = dfScotGoogle.groupby("date").sum()
    dfScotGoogle["weighted_moves"] = 1.0 + dfScotGoogle["weighted_moves"] / 100.0
    dfScotGoogle.index = pd.to_datetime(dfScotGoogle.index)

    # Keep weekdays only (dayofweek 0-4 = Monday-Friday)
    dfScotGoogle = dfScotGoogle[dfScotGoogle.index.dayofweek < 5]

    # Now upload this to the database; human/movement-multipliers/1/data.h5
    movement_multiplier_table = "movement_multiplier.csv"
    dfScotGoogle.to_csv(movement_multiplier_table)
    with DataProcessingAPI.from_config(config_filename,
                                       uri=uri,
                                       git_sha=git_sha) as api:
        api.write_table(
            "generated_sns_products/movement_multiplier",
            "movement_multiplier",
            dfScotGoogle,
        )
Example #3
def download_google_mogility_data(la_list):
    """
    Downloads the mobility data from google for Scottish local authorities if it doesn't exists in data_path
    :param: The list of local authorities for which we want the movement data
    :return: A dataframe containing the full ISO code (GB-iso_3166_2) and the corresponding local authority code
    """

    google_mobility_table = "Global_Mobility_Report.csv"

    with DataProcessingAPI.from_config(config_filename,
                                       uri=uri,
                                       git_sha=git_sha) as api:
        with api.read_external_object(google_mobility_table, "only") as file:
            justScotGoogle = pd.read_csv(file, low_memory=False)
            justScotGoogle = justScotGoogle[
                justScotGoogle["iso_3166_2_code"].isin(la_list)]

    return justScotGoogle
Example #4
def download_lookup_table():
    """
    Downloads the mapping of iso_3166_2 codes to local authority from the SCRC database,
    if it doesn't exist, upload it
    :return: A dataframe containing the full ISO code (GB-iso_3166_2) and the corresponding local authority code.
    """
    # ISO region to LA best-attempt lookup table: compiled by hand, Jess Enright, 30 June 2020
    lookup_table = "iso-3166-2_to_scottishLA.csv"

    with DataProcessingAPI.from_config(config_filename,
                                       uri=uri,
                                       git_sha=git_sha) as api:
        with api.read_external_object(lookup_table, "only") as file:
            dfLookup = pd.read_csv(file, low_memory=False)
            dfLookup["full_iso_code"] = "GB-" + dfLookup.iso_3166_2
            codes_of_interest = list(dfLookup["full_iso_code"])
            dfLookup = dfLookup[["full_iso_code", "la_code"]]

    return dfLookup, codes_of_interest
Example #5
def test_read_external_object_no_component(tmp_path):
    with open(tmp_path / "config.yaml", "w") as config_file:
        config_file.write("""
data_directory: .
access_log: False
fail_on_hash_mismatch: False
        """)
    with open(tmp_path / "metadata.yaml", "w") as metadata_file:
        metadata_file.write("""
- doi_or_unique_name: doi
  title: title
  filename: data.txt
        """)
    with open(tmp_path / "data.txt", "w") as data_file:
        data_file.write("hello world")
    with DataProcessingAPI.from_config(tmp_path / "config.yaml", "uri",
                                       "sha") as api:
        with api.read_external_object("doi", "title") as file:
            assert file.read().decode() == "hello world"
Example #6
def main():
    config_filename = Path(__file__).parent / "data_processing_config.yaml"
    uri = "data_processing_uri"
    git_sha = "data_processing_git_sha"
    with DataProcessingAPI.from_config(config_filename,
                                       uri=uri,
                                       git_sha=git_sha) as api:

        nrs_internal = "human/demographics/population/scotland/1.0.0.h5"
        data_plus_lists = api.read_array(nrs_internal,
                                         "health_board/age/genders")

        female_data = data_plus_lists[0][0]
        male_data = data_plus_lists[0][1]
        placeNames = list(data_plus_lists[1][0][1])
        ageNames = list(data_plus_lists[1][1][1])

        ages = [int(s.replace("AGE", "").replace("+", "")) for s in ageNames]
        female_pop = pd.DataFrame(female_data, index=ages,
                                  columns=placeNames).T
        male_pop = pd.DataFrame(male_data, index=ages, columns=placeNames).T

        age_class_dict = {
            "[0,17)": range(0, 17),
            "[17,70)": range(17, 70),
            "70+": range(70, 91),
        }

        aggFemale = aggregate_columns_and_rename(female_pop, age_class_dict,
                                                 "Female")
        aggMale = aggregate_columns_and_rename(male_pop, age_class_dict,
                                               "Male")
        aggTogether = pd.concat([aggFemale, aggMale])
        aggTogether = aggTogether.set_index(["Health_Board", "Sex"]).stack()
        aggTogether = aggTogether.reset_index()
        aggTogether.columns = ["Health_Board", "Sex", "Age", "Total"]
        aggTogether.to_csv("check_pop_table.csv", index=False)

        api.write_table(
            "generated_sns_products/population_healthboards_scotland",
            "population_healthboards_scotland",
            aggTogether,
        )
Example #7
def download_pop_table():
    """
    Download the population data from an external source if it doesn't exists in data_path, using only the Area1 data and the first 3 columns
    the LA code, name and population, removing the commas in the population numbers.
    :return: A dataframe containing the local authority code, name and population
             and the total polulation
    """

    population_table = "mid-year-pop-est-18-tabs_Table 2.csv"
    # The downloading below isn't currently in use - we want these scripts not to download anything directly,
    # but instead to handle that as part of the database. However, I've left it here as a record, in case we need to include it again.
    # If the population table doesn't exist download it.
    # if not Path(data_path / population_table).exists():
    #     print(f"Could not find {data_path}/{population_table}, downloading it")
    #     url = "https://www.nrscotland.gov.uk/files//statistics/population-estimates/mid-18/mid-year-pop-est-18-tabs.zip"
    #     zip_filename = "mid-year-pop-est-18-tabs.zip"
    #     urllib.request.urlretrieve(
    #         url, zip_filename
    #     )
    #
    #     with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    #         zip_ref.extractall(data_path, members=[population_table])
    #
    #     # clean up (i.e. remove) the downloaded datafile(s)
    #     Path(zip_filename).unlink(missing_ok=False)

    with DataProcessingAPI.from_config(config_filename,
                                       uri=uri,
                                       git_sha=git_sha) as api:
        with api.read_external_object(population_table, "only") as file:
            dfPop = pd.read_csv(file, skiprows=5, nrows=32, usecols=[0, 1, 2])
            dfPop.columns = ["la_code", "la_name", "population"]
            dfPop["population"] = dfPop["population"].str.replace(
                ",", "").astype(int)

    total_population = dfPop["population"].sum()

    return dfPop, total_population
How to run this module:

This script can be run with
```
python generating_rates_from_epidemiological_params.py
```
The script generates a .h5 file of compartment transition rates, in the location specified in data_processing_config.yaml
"""
from pathlib import Path
import pandas as pd
from data_pipeline_api.data_processing_api import DataProcessingAPI

config_filename = Path(__file__).parent / "data_processing_config.yaml"
uri = "data_processing_uri"
git_sha = "data_processing_git_sha"
with DataProcessingAPI.from_config(config_filename, uri=uri,
                                   git_sha=git_sha) as api:

    # States: S E A A_2 I H R D
    # We need to generate rates for transitions:
    #   E   -> E
    #   E   -> A_2
    #   A_2 -> A_2
    #   A_2 -> R
    #   E   -> A
    #   A   -> A
    #   A   -> I
    #   I   -> I
    #   I   -> D
    #   I   -> H
    #   I   -> R
    #   H   -> H
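    # --- Hedged sketch only; the original script is truncated at this point. ---
    # One way the rate table for the transitions listed above might be assembled
    # and stored, reusing the api.write_table pattern seen in the other examples.
    # The product/component names and the zero rates are placeholders, not values
    # taken from the original script.
    transitions = [
        ("E", "E"), ("E", "A_2"), ("A_2", "A_2"), ("A_2", "R"),
        ("E", "A"), ("A", "A"), ("A", "I"), ("I", "I"),
        ("I", "D"), ("I", "H"), ("I", "R"), ("H", "H"),
    ]
    rates = pd.DataFrame(transitions, columns=["Source", "Target"])
    rates["Rate"] = 0.0  # placeholder; real rates come from epidemiological parameters
    api.write_table(
        "generated_sns_products/compartment_transition_rates",  # placeholder name
        "compartment_transition_rates",
        rates,
    )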
def main():
    config_filename = Path(__file__).parent / "data_processing_config.yaml"
    uri = "data_processing_uri"
    git_sha = "data_processing_git_sha"
    with DataProcessingAPI.from_config(config_filename,
                                       uri=uri,
                                       git_sha=git_sha) as api:
        # The lookup table file below is available externally from
        # https://www2.gov.scot/Resource/0046/00462936.csv
        upward = "00462936.csv"
        # The commutes file below is available from  http://wicid.ukdataservice.ac.uk/ to academics and local
        # government after making an account and agreeing to a EULA for access to safeguarded data
        flows = "wu03buk_oa_wz_v4.csv"

        with api.read_external_object(upward, 'only') as file:
            dfUp = pd.read_csv(file)

        dfUp = dfUp[["OutputArea", "DataZone", "InterZone"]]
        dfUp = dfUp.set_index("OutputArea")

        with api.read_external_object(flows, 'only') as file:
            dfMoves = pd.read_csv(
                file,
                names=[
                    "sourceOA",
                    "destOA",
                    "total",
                    "breakdown1",
                    "breakdown2",
                    "breakdown3",
                ],
            )

        withSourceDZ = dfMoves.merge(dfUp,
                                     how="inner",
                                     left_on="sourceOA",
                                     right_index=True)
        withBothDZ = withSourceDZ.merge(dfUp,
                                        how="inner",
                                        left_on="destOA",
                                        right_index=True)

        withBothIZ = withBothDZ[["InterZone_x", "InterZone_y", "total"]]
        withBothIZ.columns = ["source_IZ", "dest_IZ", "weight"]
        withBothIZ = withBothIZ.groupby(["source_IZ", "dest_IZ"]).sum()
        withBothIZ = withBothIZ.reset_index()

        withBothDZ = withBothDZ[["DataZone_x", "DataZone_y", "total"]]
        withBothDZ.columns = ["source_DZ", "dest_DZ", "weight"]

        withBothDZ = withBothDZ.groupby(["source_DZ", "dest_DZ"]).sum()
        withBothDZ = withBothDZ.reset_index()

        api.write_table(
            "generated_sns_products/wu03buk_oa_wz_v4_scottish_datazones",
            "wu03buk_oa_wz_v4_scottish_datazones",
            withBothDZ,
        )
        api.write_table(
            "generated_sns_products/wu03buk_oa_wz_v4_scottish_interzones",
            "wu03buk_oa_wz_v4_scottish_interzones",
            withBothIZ,
        )