Example #1
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts):
        # Read all files in the eurostat folder and merge them together
        eurostat_directory = SRC / "data" / "eurostat"
        dataframes = [
            read_file(file_name)
            for file_name in eurostat_directory.glob("*.csv")
        ]
        data = table_merge(dataframes, how="outer").dropna(subset=["key"])

        # Use only keys available in metadata
        return data.merge(aux["metadata"][["key"]], how="inner")
Example #2
    def parse(self, sources: List[str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        data = read_file(
            sources[0], error_bad_lines=False, encoding="ISO-8859-1", sep=";"
        ).rename(
            columns={
                "Date":
                "date",
                "Nombre de personnes en soins normaux":
                "current_hospitalized",
                "Nombre de personnes en soins intensifs (sans patients du Grand Est)":
                "current_intensive_care",
                "Nombre de décès - cumulé (sans patients du Grand Est)":
                "deceased",
                "Total patients COVID ayant quitté l'hôpital (hospitalisations stationnaires, données brutes)":
                "recovered",
                "Nombre de nouvelles personnes testées COVID+ par jour ":
                "tested",
            })

        # Get date in ISO format
        data.date = data.date.apply(
            lambda x: datetime_isoformat(x, "%d/%m/%Y"))

        # Keep only columns we can process
        data = data[[
            "date",
            "current_hospitalized",
            "current_intensive_care",
            "deceased",
            "recovered",
            "tested",
        ]]

        # Convert recovered into a number
        data.recovered = data.recovered.apply(
            lambda x: int(x.replace("-", "0")))

        # Compute the daily counts
        data["key"] = "LU"
        data_new = grouped_diff(data[["key", "date", "deceased"]],
                                ["key", "date"])
        data_cum = grouped_cumsum(data[["key", "date", "tested", "recovered"]],
                                  ["key", "date"])
        data_cur = data[[
            "key", "date", "current_hospitalized", "current_intensive_care"
        ]]
        data = data_new.merge(data_cum, how="outer").merge(data_cur,
                                                           how="outer")

        # Output the results
        return data
Example #3
    def test_key_build(self):
        skip_keys = ("UA_40", "UA_43")
        metadata = read_file(METADATA_PATH).set_index("key")
        localities = read_file(SRC / "data" /
                               "localities.csv")["locality"].unique()
        for key, record in metadata.iterrows():
            msg = f"{key} does not match region codes in metadata"
            tokens = key.split("_")
            if key in skip_keys:
                continue
            elif len(tokens) == 1:
                self.assertEqual(key, record["country_code"], msg)
            elif key in localities or not isna(record["locality_code"]):
                self.assertEqual(tokens[-1], record["locality_code"], msg)
            elif len(tokens) == 2:
                self.assertEqual(tokens[0], record["country_code"], msg)
                self.assertEqual(tokens[1], record["subregion1_code"], msg)
            elif len(tokens) == 3:
                self.assertEqual(tokens[0], record["country_code"], msg)
                self.assertEqual(tokens[1], record["subregion1_code"], msg)
                self.assertEqual(tokens[2], record["subregion2_code"], msg)
Example #4
def _process_cache_file(file_map: Dict[str, str], date: str) -> DataFrame:
    data = read_file(
        file_map[date])["vaccination_county_condensed_data"].values.tolist()
    data = DataFrame.from_records(data)

    data = data[data["FIPS"] != "UNK"]
    data = data.assign(
        key="US_" + data["StateAbbr"].str[:2] + "_" + data["FIPS"],
        Series_Complete_Yes=data["Series_Complete_Yes"].fillna(0).astype(int),
    )
    data = table_rename(data, _column_adapter, drop=True)

    data["date"] = date
    return data
Example #5
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:
        data = read_file(sources[0], sheet_name=parse_opts.get("sheet_name"))

        # Process the individual sheet
        data = _sheet_processors[parse_opts.get("sheet_name")](data)

        # Fix up the date format
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))

        # Add a key to all the records (state-level only)
        data["key"] = "US_DC"
        return data
Example #6
def _read_main_table(path: Path) -> DataFrame:
    return read_file(
        path,
        dtype={
            "country_code": "category",
            "country_name": "category",
            "subregion1_code": "category",
            "subregion1_name": "category",
            "subregion2_code": "category",
            "subregion2_name": "category",
            "3166-1-alpha-2": "category",
            "3166-1-alpha-3": "category",
            "aggregation_level": "category",
        },
    )
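
Reading the repeated code and name columns with the pandas "category" dtype keeps the main table's memory footprint small. A self-contained toy illustration of the same idea (toy data, not the project schema):

from pandas import DataFrame

toy = DataFrame({"country_code": ["ES"] * 100_000 + ["FR"] * 100_000})
# deep=True counts the actual string storage, not just the object pointers
as_object = toy["country_code"].memory_usage(deep=True)
as_category = toy["country_code"].astype("category").memory_usage(deep=True)
print(as_object, as_category)  # the categorical column is several times smaller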
Example #7
    def parse(self, sources: List[str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        data = read_file(sources[0],
                         sheet_name="Antal intensivvårdade per dag").rename(
                             columns={
                                 "Datum_vårdstart": "date",
                                 "Antal_intensivvårdade": "intensive_care"
                             })

        # Get date in ISO format
        data["key"] = "SE"
        data.date = data.date.apply(
            lambda x: datetime_isoformat(x, "%m/%d/%Y"))
        return grouped_cumsum(data, ["key", "date"])
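
The grouped_cumsum helper is assumed here to accumulate the value columns per key, ordered by date; a rough sketch of that idea in plain pandas on toy data (the project helper may handle columns and missing values differently):

from pandas import DataFrame

toy = DataFrame({
    "key": ["SE", "SE", "SE"],
    "date": ["2020-04-01", "2020-04-02", "2020-04-03"],
    "intensive_care": [2, 3, 1],
})
# Cumulative sum of the value column within each key, ordered by date
toy = toy.sort_values(["key", "date"])
toy["intensive_care"] = toy.groupby("key")["intensive_care"].cumsum()
print(toy)  # 2, 5, 6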
Example #8
    def test_derive_localities(self):
        localities = read_file(SRC / "data" / "localities.csv")
        test_data = LOCALITY_TEST_DATA.copy()
        expected = DataFrame.from_records(
            [
                {"key": "BR_RJ_GIG", "date": "2020-01-01", "val": 1},
                {"key": "BR_RJ_GIG", "date": "2020-01-02", "val": 1},
                {"key": "US_GA_ATL", "date": "2020-01-01", "val": 4},
                {"key": "US_GA_ATL", "date": "2020-01-02", "val": 4},
            ]
        )

        columns = test_data.columns
        test_result = derive_localities(localities, test_data)[columns]
        self.assertEqual(test_result.to_csv(index=False), expected.to_csv(index=False))
Example #9
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        data = read_file(sources[0], sheet_name=parse_opts.get("sheet_name"))
        data.columns = data.iloc[1]
        data = table_rename(data.iloc[2:], _column_adapter, drop=True)
        data["date"] = data["date"].astype(str).apply(lambda x: x[:10])
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%Y-%m-%d"))
        data = data.dropna(subset=["date"])

        if parse_opts.get("key"):
            data["key"] = parse_opts.get("key")

        return data
Example #10
File: publish.py  Project: barbdowns/data
def subset_latest(output_folder: Path, csv_file: Path) -> DataFrame:
    """ Outputs latest data for each key """
    latest_folder = output_folder / "latest"
    latest_folder.mkdir(exist_ok=True)
    table = read_file(csv_file, low_memory=False)

    # Degenerate case: this table has no date
    if not "date" in table.columns or len(table.date.dropna()) == 0:
        return export_csv(table, latest_folder / csv_file.name)
    else:
        non_null_columns = [
            col for col in table.columns if not col in ("key", "date")
        ]
        table = table.dropna(subset=non_null_columns, how="all")
        table = table.sort_values("date").groupby("key").last().reset_index()
        export_csv(table, latest_folder / csv_file.name)
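
The sort-then-groupby-last idiom above is what selects the most recent record per key; a minimal, runnable illustration with toy data:

from pandas import DataFrame

toy = DataFrame({
    "key": ["ES", "ES", "FR"],
    "date": ["2020-01-01", "2020-01-02", "2020-01-01"],
    "total_confirmed": [1, 2, 3],
})
# Sorting by date first guarantees that .last() picks the newest row per key
latest = toy.sort_values("date").groupby("key").last().reset_index()
print(latest)  # ES keeps the 2020-01-02 row, FR keeps its only row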
Example #11
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        data = read_file(sources[0], sheet_name="Antal per dag region").rename(
            columns={"Statistikdatum": "date"})

        # Get date in ISO format
        data.date = data.date.astype(str)

        # Unpivot the regions which are columns
        data.columns = [col.replace("_", " ") for col in data.columns]
        data = data.drop(columns=["Totalt antal fall"]).set_index("date")
        data = pivot_table(data, pivot_name="match_string")

        data["country_code"] = "SE"
        return data.rename(columns={"value": "new_confirmed"})
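
The project helper pivot_table is used above to unpivot the per-region columns into rows; the same idea in plain pandas, sketched on toy data (the helper's exact output columns are an assumption):

from pandas import DataFrame

toy = DataFrame({"date": ["2020-04-01"], "Stockholm": [5], "Uppsala": [2]})
# Turn each region column into its own row, keyed by date
long = toy.melt(id_vars=["date"], var_name="match_string", value_name="value")
print(long)  # one row per (date, region) pair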
Example #12
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        data = read_file(sources[0],
                         sheet_name="Antal intensivvårdade per dag").rename(
                             columns={
                                 "Datum_vårdstart": "date",
                                 "Antal_intensivvårdade": "new_intensive_care"
                             })

        # Get date in ISO format
        data["key"] = "SE"
        # The source date format is %m/%d/%Y, but pandas silently converts it to a date object
        data["date"] = data["date"].astype(str).apply(
            lambda x: datetime_isoformat(x, "%Y-%m-%d"))
        return data
Example #13
def _subset_last_days(output_folder: Path, days: int) -> None:
    """ Outputs last N days of data """
    n_days_folder = output_folder / str(days)
    n_days_folder.mkdir(exist_ok=True)
    for csv_file in (output_folder).glob("*.csv"):
        table = read_file(csv_file)

        # Degenerate case: this table has no date
        if not "date" in table.columns or len(table.date.dropna()) == 0:
            export_csv(table, n_days_folder / csv_file.name)
        else:
            last_date = datetime.date.fromisoformat(max(table.date))
            # Since APAC is almost always one day ahead, increase the window by 1
            first_date = last_date - datetime.timedelta(days=days + 1)
            export_csv(table[table.date >= first_date.isoformat()],
                       n_days_folder / csv_file.name)
Example #14
    def parse(self, sources: List[str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        data = read_file(sources[0],
                         error_bad_lines=False,
                         encoding="ISO-8859-1")
        data = table_rename(
            data,
            {
                "Date":
                "date",
                "Nombre de personnes en soins normaux":
                "current_hospitalized",
                "Nombre de personnes en soins intensifs (sans patients du Grand Est)":
                "current_intensive_care",
                "Nombre de décès - cumulé (sans patients du Grand Est)":
                "total_deceased",
                "Total patients COVID ayant quitté l'hôpital (hospitalisations stationnaires, données brutes)":
                "new_recovered",
                "Nombre de nouvelles personnes testées COVID+ par jour ":
                "new_tested",
            },
        )

        # Get date in ISO format
        data.date = data.date.apply(
            lambda x: datetime_isoformat(x, "%d/%m/%Y"))

        # Keep only columns we can process
        data = data[[
            "date",
            "current_hospitalized",
            "current_intensive_care",
            "total_deceased",
            "new_recovered",
            "new_tested",
        ]]

        # Convert recovered into a number
        data.new_recovered = data.new_recovered.apply(
            lambda x: safe_int_cast(x.replace("-", "0")))

        # Only country-level data is provided
        data["key"] = "LU"

        # Output the results
        return data
Example #15
def _process_cache_file(file_map: Dict[str, str], date: str) -> DataFrame:
    data = read_file(file_map[date])["vaccination_data"].values.tolist()
    data = DataFrame([list(v.values()) for v in data],
                     columns=list(data[0].keys()))

    data = data.loc[data.Location.isin(us_states)]
    for col in set(_column_adapter.keys()).intersection(data.columns):
        data[col] = data[col].fillna(0).astype(int)

    data["key"] = data["Location"].apply(lambda x: "US"
                                         if x == "US" else "US_" + x[:2])

    data = table_rename(data, _column_adapter, drop=True)

    data["date"] = date

    return data
Example #16
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        url_tpl = sources[0]
        metadata = aux["metadata"]
        metadata = metadata[metadata["country_code"] == "FR"]

        fr_isos = read_file(SRC / "data" / "fr_iso_codes.csv")
        fr_iso_map = {
            iso: code
            for iso, code in zip(fr_isos["iso_code"], fr_isos["region_code"])
        }
        fr_codes = metadata[["subregion1_code", "subregion2_code"]].dropna()
        regions_iter = fr_codes["subregion1_code"].unique()
        deps_iter = [record for _, record in fr_codes.iterrows()]

        # For country level, there is no need to estimate confirmed from tests
        column_adapter_country = dict(_column_adapter)
        column_adapter_country.pop("testsPositifs")

        # Get country level data
        country = _get_country(url_tpl, column_adapter_country)

        # Country level data has totals instead of diffs, so we compute the diffs by hand
        country.sort_values("date", inplace=True)
        country["new_confirmed"] = country["total_confirmed"].diff()
        country.drop(columns=["total_confirmed"], inplace=True)

        # For region level, we can only estimate confirmed from tests
        column_adapter_region = dict(_column_adapter)
        column_adapter_region.pop("casConfirmes")

        # Get region level data
        get_region_func = partial(_get_region, url_tpl, column_adapter_region,
                                  fr_iso_map)
        regions = concat(list(thread_map(get_region_func, regions_iter)))

        # Get department level data
        get_department_func = partial(_get_department, url_tpl,
                                      column_adapter_region)
        departments = concat(list(thread_map(get_department_func, deps_iter)))

        data = concat([country, regions, departments])
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))
        return data.sort_values("date")
Example #17
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        url_tpl = sources[0]
        metadata = aux["metadata"]
        metadata = metadata[metadata["country_code"] == "FR"]

        fr_isos = read_file(SRC / "data" / "fr_iso_codes.csv")
        fr_iso_map = {
            iso: code
            for iso, code in zip(fr_isos["iso_code"], fr_isos["region_code"])
        }
        fr_codes = metadata[["subregion1_code", "subregion2_code"]].dropna()
        regions_iter = fr_codes["subregion1_code"].unique()
        deps_iter = (record for _, record in fr_codes.iterrows())

        if parse_opts.get("country"):
            # For country level, there is no need to estimate confirmed from tests
            _column_adapter_2 = dict(_column_adapter)
            _column_adapter_2.pop("testsPositifs")
            data = _get_country(url_tpl, _column_adapter_2)

        else:
            # For region level, we can only estimate confirmed from tests
            _column_adapter_2 = dict(_column_adapter)
            _column_adapter_2.pop("casConfirmes")

            get_region_func = partial(_get_region, url_tpl, _column_adapter_2,
                                      fr_iso_map)
            regions = concat(list(thread_map(get_region_func, regions_iter)))

            get_department_func = partial(_get_department, url_tpl,
                                          _column_adapter_2)
            departments = concat(
                list(
                    thread_map(get_department_func,
                               deps_iter,
                               total=len(fr_codes))))

            data = concat([regions, departments])

        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))
        return data
Example #18
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        # Retrieve the CSV files from https://covid19.isciii.es
        data = (read_file(sources[0],
                          error_bad_lines=False,
                          encoding="ISO-8859-1").rename(
                              columns={
                                  "FECHA": "date",
                                  "CCAA": "subregion1_code",
                                  "Fallecidos": "total_deceased",
                                  "Hospitalizados": "total_hospitalized",
                                  "UCI": "total_intensive_care",
                              }).dropna(subset=["date"]))

        # Confirmed cases are split across 2 columns
        confirmed_columns = ["CASOS", "PCR+"]
        for col in confirmed_columns:
            data[col] = data[col].fillna(0)
        data["total_confirmed"] = data.apply(
            lambda x: sum([x[col] for col in confirmed_columns]), axis=1)

        # Convert dates to ISO format
        data["date"] = data["date"].apply(
            lambda date: datetime_isoformat(date, "%d/%m/%Y"))

        # Keep only the columns we can process
        data = data[[
            "date",
            "subregion1_code",
            "total_confirmed",
            "total_deceased",
            "total_hospitalized",
            "total_intensive_care",
        ]]

        # Derive the key from the subregion code
        data["key"] = "ES_" + data["subregion1_code"]

        # Output the results
        return data
Example #19
    def parse(self, sources: List[str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        # Retrieve the CSV files from https://covid19.isciii.es
        df = (read_file(sources[0],
                        error_bad_lines=False,
                        encoding="ISO-8859-1").rename(
                            columns={
                                "FECHA": "date",
                                "CCAA": "subregion1_code",
                                "Fallecidos": "deceased",
                                "Hospitalizados": "hospitalized",
                                "UCI": "ICU",
                                "Recuperados": "recovered",
                            }).dropna(subset=["date"]))

        # Confirmed cases are split across 2 columns
        confirmed_columns = ["CASOS", "PCR+"]
        for col in confirmed_columns:
            df[col] = df[col].fillna(0)
        df["confirmed"] = df.apply(
            lambda x: sum([x[col] for col in confirmed_columns]), axis=1)

        # Convert dates to ISO format
        df["date"] = df["date"].apply(
            lambda date: datetime_isoformat(date, "%d/%m/%Y"))

        # Reported cases are cumulative, compute the diff
        df = grouped_diff(df, ["subregion1_code", "date"])

        # Add the country code to all records
        df["country_code"] = "ES"

        # Country-wide is the sum of all regions
        country_level = (df.drop(columns=["subregion1_code"]).groupby(
            ["date", "country_code"]).sum().reset_index())
        country_level["subregion1_code"] = None
        df = concat([country_level, df])

        # Output the results
        return df
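
grouped_diff is used above to turn cumulative counts into daily increments within each region; a plain-pandas sketch of that operation on toy data (the project helper may differ in how it handles the first day and missing values):

from pandas import DataFrame

toy = DataFrame({
    "subregion1_code": ["AN", "AN", "AN"],
    "date": ["2020-03-01", "2020-03-02", "2020-03-03"],
    "deceased": [10, 12, 15],
})
# Difference of the cumulative series within each region, ordered by date
toy = toy.sort_values(["subregion1_code", "date"])
toy["deceased"] = toy.groupby("subregion1_code")["deceased"].diff()
print(toy)  # NaN, 2, 3 (the first day has no prior value to diff against)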
Example #20
    def parse(self, sources: List[str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        # Retrieve the CSV files from https://covid19.isciii.es
        data = (read_file(sources[0],
                          error_bad_lines=False,
                          encoding="ISO-8859-1").rename(
                              columns={
                                  "FECHA": "date",
                                  "CCAA": "subregion1_code",
                                  "Fallecidos": "deceased",
                                  "Hospitalizados": "hospitalized",
                                  "UCI": "intensive_care",
                              }).dropna(subset=["date"]))

        # Confirmed cases are split across 2 columns
        confirmed_columns = ["CASOS", "PCR+"]
        for col in confirmed_columns:
            data[col] = data[col].fillna(0)
        data["confirmed"] = data.apply(
            lambda x: sum([x[col] for col in confirmed_columns]), axis=1)

        # Convert dates to ISO format
        data["date"] = data["date"].apply(
            lambda date: datetime_isoformat(date, "%d/%m/%Y"))

        # Keep only the columns we can process
        data = data[[
            "date", "subregion1_code", "confirmed", "deceased", "hospitalized",
            "intensive_care"
        ]]

        # Reported cases are cumulative, compute the diff
        data = grouped_diff(data, ["subregion1_code", "date"])

        # Add the country code to all records
        data["country_code"] = "ES"

        # Output the results
        return data
Example #21
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:
        read_opts = {k: v for k, v in parse_opts.items() if k in READ_OPTS}

        dataframes = {}
        date_start = parse_opts.pop("date_start", None)
        date_end = parse_opts.pop("date_end", None)
        for cache_key, cache_urls in sources.items():

            daily_data = []
            for date, url in cache_urls.items():
                if date_start is not None and date < date_start:
                    continue
                if date_end is not None and date > date_end:
                    continue

                data = read_file(url, **read_opts)
                data["date"] = date
                daily_data.append(data)

            dataframes[cache_key] = concat(daily_data)

        return self.parse_dataframes(dataframes, aux, **parse_opts)
Example #22
    def fetch(
        self,
        output_folder: Path,
        cache: Dict[str, str],
        fetch_opts: List[Dict[str, Any]],
        skip_existing: bool = False,
    ) -> Dict[str, str]:
        metadata = read_file(SRC / "data" / "metadata.csv")
        za = metadata[metadata["country_code"] == "ZA"]
        provinces = za[za["key"].apply(lambda x: len(x.split("_")) == 2)]
        districts = za[za["key"].apply(lambda x: len(x.split("_")) == 3)]
        url_tpl = {opt["name"]: opt["url"] for opt in fetch_opts}
        opts = {"opts": {"ignore_failure": True}}

        fetch_list = []
        for key in provinces["key"]:
            key_ = key[3:].replace("_", "-")
            fetch_list.append({"url": url_tpl["provinces"].format(key=key_), "name": key, **opts})
        for key in districts["key"]:
            key_ = key[3:].replace("_", "-")
            fetch_list.append({"url": url_tpl["districts"].format(key=key_), "name": key, **opts})

        return super().fetch(output_folder, cache, fetch_list, skip_existing=skip_existing)
Example #23
    def _parse(file_path: str, sheet_name: str, value_name: str):
        data = read_file(file_path, sheet_name=sheet_name)

        data.columns = [col.replace("NHS ", "").replace(" total", "") for col in data.iloc[1]]
        # Drop Golden Jubilee National Hospital - it has no hospitalizations and does not fit
        # any current matches in metadata.csv.
        data = data.drop(columns=["Golden Jubilee National Hospital"])
        data = data.iloc[2:].rename(columns={"Date": "date"})

        data = pivot_table(data.set_index("date"), pivot_name="match_string")
        data = data.rename(columns={"value": value_name})
        data[value_name] = data[value_name].replace("*", None).apply(safe_float_cast).astype(float)

        # Get date in ISO format
        data.date = data.date.apply(lambda x: x.date().isoformat())

        # Add metadata
        data["key"] = None
        data["country_code"] = "GB"
        data["subregion1_code"] = "SCT"
        l2_mask = data.match_string == "Scotland"
        data.loc[l2_mask, "key"] = "GB_SCT"

        return data
Example #24
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        metadata = aux["metadata"]
        metadata = metadata[metadata["country_code"] == "FR"]

        fr_isos = read_file(SRC / "data" / "fr_iso_codes.csv")
        fr_iso_map = {
            iso: code
            for iso, code in zip(fr_isos["iso_code"], fr_isos["region_code"])
        }
        fr_codes = metadata[["subregion1_code", "subregion2_code"]].dropna()
        regions_iter = fr_codes["subregion1_code"].unique()
        deps_iter = (record for _, record in fr_codes.iterrows())

        regions = concat(
            list(thread_map(partial(_get_region, fr_iso_map), regions_iter)))
        departments = concat(
            list(thread_map(_get_department, deps_iter, total=len(fr_codes))))

        data = concat([regions, departments])
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))
        return data
Example #25
    def parse(self, sources: List[str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        sheets = []
        sheet_processors = {
            "Trends": TexasDataSource._parse_trends,
            "Tests by day": TexasDataSource._parse_tests,
            "Hospitalization by Day": TexasDataSource._parse_hospitalized,
        }
        for sheet_name, sheet_processor in sheet_processors.items():
            df = sheet_processor(read_file(sources[0], sheet_name=sheet_name))
            df = df.dropna(subset=["date"])
            df.date = df.date.astype(str)
            df.date = df.date.apply(
                lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))
            sheets.append(df)

        data = table_multimerge(sheets, how="outer")
        for col in data.columns:
            if col != "date":
                data[col] = data[col].apply(safe_float_cast).astype(float)

        data["key"] = "US_TX"
        return data
Example #26
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from typing import Any, Callable, Dict, List
from pandas import DataFrame, isna
from unidecode import unidecode
from lib.cast import age_group, safe_int_cast
from lib.constants import SRC
from lib.io import read_file
from lib.utils import get_or_default

STRATIFIED_VALUES = read_file(SRC / "data" /
                              "stratified_values.csv").set_index("type")


def _default_adapter_factory(key: str) -> Callable[[str], str]:

    mapping = {"other": f"{key}_other", "unknown": f"{key}_unknown"}
    for value, alias in STRATIFIED_VALUES.loc[key].set_index(
            "value")["alias"].iteritems():
        mapping[value] = value
        if not isna(alias):
            mapping[alias] = value

    def default_adapter(value: str):
        if isna(value):
            return mapping["unknown"]
        value = re.sub(r"[\s\-]", "_", unidecode(str(value).lower()))
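
The excerpt above is cut off inside default_adapter. A minimal sketch of how the function might end, assuming unmatched values fall back to the "unknown" bucket (the real code presumably uses the imported get_or_default helper; plain dict.get is used here to avoid guessing its signature):

        # Hypothetical completion: look up the normalized value, falling back
        # to the "unknown" bucket when it is not a known value or alias
        return mapping.get(value, mapping["unknown"])

    return default_adapter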
Example #27
from lib.utils import drop_na_records


def snake_to_camel_case(txt: str) -> str:
    """ Used to convert V2 column names to V1 column names for backwards compatibility """
    return re.sub(r"_(\w)", lambda m: m.group(1).upper(), txt.capitalize())


if __name__ == "__main__":

    # Create the folder which will be published
    public_folder = SRC / ".." / "output" / "public"
    public_folder.mkdir(exist_ok=True, parents=True)

    # Create the v1 data.csv file
    main_table = read_file(f"{URL_OUTPUTS_PROD}/main.csv", low_memory=False)
    data = main_table[main_table.aggregation_level < 2]
    rename_columns = {
        "date": "Date",
        "key": "Key",
        "country_code": "CountryCode",
        "country_name": "CountryName",
        "subregion1_code": "RegionCode",
        "subregion1_name": "RegionName",
        "total_confirmed": "Confirmed",
        "total_deceased": "Deaths",
        "latitude": "Latitude",
        "longitude": "Longitude",
        "population": "Population",
    }
    data = data[rename_columns.keys()].rename(columns=rename_columns)
Example #28
# This script must be run from /src
sys.path.append(os.getcwd())
from lib.io import read_file
from lib.utils import ROOT

# Parse arguments from the command line
argparser = ArgumentParser()
argparser.add_argument("country_code", type=str)
argparser.add_argument("--nuts-level", type=int, default=2)
argparser.add_argument("--dc-api-key",
                       type=str,
                       default=os.environ["DATACOMMONS_API_KEY"])
args = argparser.parse_args()

# Get the country name
aux = read_file(ROOT / "src" / "data" / "metadata.csv").set_index("key")
country_name = aux.loc[args.country_code, "country_name"]

# Convert 2-letter to 3-letter country code
iso_codes = read_file(ROOT / "src" / "data" /
                      "country_codes.csv").set_index("key")
country_code_alpha_3 = iso_codes.loc[args.country_code, "3166-1-alpha-3"]

dc.set_api_key(args.dc_api_key)
country = "country/{}".format(country_code_alpha_3)
nuts_name = "EurostatNUTS{}".format(args.nuts_level)
regions = dc.get_places_in([country], nuts_name)[country]
names = dc.get_property_values(regions, "name")
for key, name in names.items():
    region_name = name[0]
    region_code = key.split("/")[-1][2:]
Example #29
def _process_cache_file(file_map: Dict[str, str], date: str) -> DataFrame:
    data = table_rename(read_file(file_map[date]), _column_adapter, drop=True)
    data["subregion1_code"] = data["subregion1_code"].apply(
        lambda x: _ISO_CODE_MAP.get(numeric_code_as_string(x, 2) or "00"))
    data["date"] = date
    return data
Example #30
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        # Use a manager to handle memory accessed across processes
        manager = Manager()

        # Get all the weather stations with data up until last month from inventory
        today = datetime.date.today()
        min_date = (today - datetime.timedelta(days=30)).strftime("%Y%m%d")
        stations = read_file(
            sources["inventory"]).rename(columns={
                "LAT": "lat",
                "LON": "lon",
                "ELEV(M)": "elevation"
            })
        stations = stations[stations.END > int(min_date)]
        stations["id"] = stations["USAF"] + stations["WBAN"].apply(
            lambda x: f"{x:05d}")

        # Open the station data as a compressed file
        with tarfile.open(sources["gsod"], mode="r:gz") as stations_tar:

            # Build the station cache by decompressing all files in memory
            station_cache = manager.dict()
            for member in pbar(stations_tar.getmembers(),
                               desc="Decompressing"):

                if not member.name.endswith(".csv"):
                    continue

                # Read the records from the provided station
                data = read_file(
                    stations_tar.extractfile(member),
                    file_type="csv",
                    usecols=_COLUMN_MAPPING.keys(),
                ).rename(columns=_COLUMN_MAPPING)

                # Fix data types
                data["noaa_station"] = data["noaa_station"].astype(str)
                data["rainfall"] = data["rainfall"].apply(conv_dist)
                data["snowfall"] = data["snowfall"].apply(conv_dist)
                data["dew_point"] = data["dew_point"].apply(conv_temp)
                for temp_type in ("average", "minimum", "maximum"):
                    col = f"{temp_type}_temperature"
                    data[col] = data[col].apply(conv_temp)

                # Compute the relative humidity from the dew point and average temperature
                data["relative_humidity"] = data.apply(
                    lambda x: relative_humidity(x["average_temperature"], x[
                        "dew_point"]),
                    axis=1)

                station_cache[member.name.replace(".csv", "")] = data

        # Get all the POI from metadata and go through each key
        keep_columns = ["key", "latitude", "longitude"]
        metadata = read_file(sources["geography"])[keep_columns].dropna()

        # Only use keys present in the metadata table
        metadata = metadata.merge(aux["metadata"])[keep_columns]

        # Convert all coordinates to radians
        stations["lat"] = stations["lat"].apply(math.radians)
        stations["lon"] = stations["lon"].apply(math.radians)
        metadata["lat"] = metadata["latitude"].apply(math.radians)
        metadata["lon"] = metadata["longitude"].apply(math.radians)

        # Make sure the stations and the cache are sent to each function call
        map_func = partial(_process_location, station_cache, stations)

        # We don't care about the index while iterating over each metadata item
        map_iter = (record for _, record in metadata.iterrows())

        # Parse each location in parallel; process_map spreads the work across processes
        records = process_map(map_func, map_iter, total=len(metadata))

        return concat(records)
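
The latitude and longitude columns are converted to radians, presumably so that _process_location (not shown) can compute great-circle distances between each location and the candidate weather stations. A generic haversine sketch for reference, not the project's implementation:

import math

def haversine_km(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    # All inputs are in radians, matching the conversion done above
    dlat, dlon = lat2 - lat1, lon2 - lon1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    return 2 * 6371 * math.asin(math.sqrt(a))  # mean Earth radius in km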