Example #1
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        file_map = sources["US_CA-mortality-stratified"]
        map_func = partial(_process_html_file, file_map)
        map_opts = dict(desc="Processing Cache Files", total=len(file_map))
        records = sum(process_map(map_func, file_map.keys(), **map_opts), [])
        assert len(records) > 0, "No records were found"
        return DataFrame.from_records(records)
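
The call site above only works if `_process_html_file` accepts the bound `file_map` plus a single key and returns a list of records; `sum(..., [])` then flattens the per-file lists into one list for `DataFrame.from_records`. A minimal sketch of that shape, with the actual HTML parsing left out because it is not shown in the example:

from typing import Dict, List

# Hypothetical sketch of _process_html_file, inferred from the call site above:
# it receives the full file_map (bound via partial) and one key, parses the
# cached HTML file behind that key, and returns a list of record dicts that
# DataFrame.from_records() can consume. The parsing step itself is omitted.
def _process_html_file(file_map: Dict[str, str], key: str) -> List[dict]:
    records: List[dict] = []
    with open(file_map[key], "r", encoding="utf8") as fh:
        html = fh.read()
        # ... extract mortality records from the HTML tables in `html` here ...
    return records
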
Example #2
    def test_dry_run_pipeline(self):
        """
        This test loads the real configuration for all sources in a pipeline and runs them
        against a subset of the metadata to match keys. The subset is chosen by running the
        provided `test.metadata_query` against the metadata auxiliary table or, if no query
        is present, by selecting a random sample instead.
        """
        list(process_map(_test_data_pipeline, list(get_pipeline_names()), max_workers=2))
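
For reference, the subset selection described in the docstring can be expressed with plain pandas. The helper name and parameters below are illustrative stand-ins, not the project's actual code:

from typing import Optional
from pandas import DataFrame

# Illustrative sketch of the subset selection the docstring describes; the
# function name, signature and sample size are hypothetical.
def _select_test_subset(metadata: DataFrame, metadata_query: Optional[str] = None,
                        sample_size: int = 10) -> DataFrame:
    if metadata_query:
        # Use the source-provided `test.metadata_query` to pick the keys to test
        return metadata.query(metadata_query)
    # Otherwise fall back to a random sample of the metadata table
    return metadata.sample(min(sample_size, len(metadata)))
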
Example #3
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        # Get all the weather stations with data up until last month from inventory
        year = int(parse_opts.get("year")) if "year" in parse_opts else None
        cur_date = datetime.date(year, 12, 31) if year else datetime.date.today()
        min_date = (cur_date - datetime.timedelta(days=30)).strftime("%Y%m%d")
        stations = read_file(sources["inventory"]).rename(
            columns={"LAT": "lat", "LON": "lon", "ELEV(M)": "elevation"}
        )
        stations = stations[stations.END > int(min_date)]
        stations["id"] = stations["USAF"] + stations["WBAN"].apply(lambda x: f"{x:05d}")

        # Open the station data as a compressed file
        station_cache = dict()
        with tarfile.open(sources["gsod"], mode="r:gz") as stations_tar:

            # Build the station cache by decompressing all files in memory
            map_iter = stations_tar.getmembers()
            map_func = partial(_extract_station, stations_tar)
            map_opts = dict(desc="Decompressing", total=len(map_iter))
            for station_item in pbar(map(map_func, map_iter), **map_opts):
                station_cache.update(station_item)

        # Get all the POI from metadata and go through each key
        keep_columns = ["key", "latitude", "longitude"]
        metadata = read_file(sources["geography"])[keep_columns].dropna()

        # Only use keys present in the metadata table
        metadata = metadata.merge(aux["metadata"])[keep_columns]

        # Convert all coordinates to radians
        stations["lat"] = stations["lat"].apply(math.radians)
        stations["lon"] = stations["lon"].apply(math.radians)
        metadata["lat"] = metadata["latitude"].apply(math.radians)
        metadata["lon"] = metadata["longitude"].apply(math.radians)

        # Use a manager to handle memory accessed across processes
        manager = Manager()
        station_cache = manager.dict(station_cache)

        # Make sure the stations and the cache are sent to each function call
        map_func = partial(_process_location, station_cache, stations)

        # We don't care about the index while iterating over each metadata item
        map_iter = (record for _, record in metadata.iterrows())

        # Fan the work out across multiple worker processes; each call only needs
        # the shared station cache and the station list
        records = process_map(map_func, map_iter, total=len(metadata))

        return concat(records)
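
`_extract_station` is not shown here, but Example #8 below inlines the same decompression step, so a sketch consistent with both call sites looks roughly like this. `read_file` and `_COLUMN_MAPPING` are assumed to come from the surrounding module, and the type conversions from Example #8 are omitted:

import tarfile
from typing import Dict
from pandas import DataFrame

# Hedged sketch of _extract_station, mirroring the inline logic in Example #8:
# decompress one tar member and return a single-entry {station_name: DataFrame}
# dict so the caller can merge it into station_cache.
def _extract_station(stations_tar: tarfile.TarFile,
                     member: tarfile.TarInfo) -> Dict[str, DataFrame]:
    # Skip anything that is not a per-station CSV file
    if not member.name.endswith(".csv"):
        return {}

    data = read_file(
        stations_tar.extractfile(member),
        file_type="csv",
        usecols=_COLUMN_MAPPING.keys(),
    ).rename(columns=_COLUMN_MAPPING)
    # ... unit conversions and relative humidity as in Example #8 ...

    # Station name is the member name without the .csv extension
    return {member.name.replace(".csv", ""): data}
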
Example #4
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:
        file_map = sources["US_CDC_states_vaccinations"]

        map_func = partial(_process_cache_file, file_map)
        map_opts = dict(desc="Processing Cache Files", total=len(file_map))
        data = concat(process_map(map_func, file_map.keys(), **map_opts))

        assert len(data) > 0, "No records were found"
        return data
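
`_process_cache_file` is used the same way here and in Example #6: it receives the bound `file_map` plus one key and must return a DataFrame ready for `concat`. A hedged sketch of that contract; the cache file format and the meaning of the key are assumptions:

from typing import Dict
from pandas import DataFrame

# Hedged sketch of _process_cache_file based only on its call sites: load the
# cached snapshot behind one key of file_map and return it as a DataFrame.
# read_file is the same helper used elsewhere in these examples.
def _process_cache_file(file_map: Dict[str, str], key: str) -> DataFrame:
    data = read_file(file_map[key])
    data["date"] = key  # assumption: keys encode the snapshot date
    return data
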
Example #5
    def parse_dataframes(self, dataframes: Dict[Any, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Partition dataframes based on the state the data is for
        df = table_rename(dataframes[0], _column_adapter)
        partitions = (df[df["subregion1_code"] == code]
                      for code in _IBGE_STATES.values())

        # Process each partition in a separate worker process
        map_opts = dict(desc="Processing Partitions", total=len(_IBGE_STATES))
        return concat(process_map(_process_partition, partitions, **map_opts))
Example #6
    def parse(self, sources: Dict[str, Dict[str, str]],
              aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        file_map = sources["AR_vaccinations"]
        map_func = partial(_process_cache_file, file_map)
        map_opts = dict(desc="Processing Cache Files", total=len(file_map))
        data = concat(process_map(map_func, file_map.keys(), **map_opts))
        assert len(data) > 0, "No records were found"

        # Estimate total doses from first and second doses
        data["total_vaccine_doses_administered"] = (
            data["total_persons_vaccinated"] +
            data["total_persons_fully_vaccinated"])

        data["key"] = "AR_" + data["subregion1_code"]
        return data
Example #7
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Partition dataframes based on the state the data is for
        partitions = {code: [] for code in _IBGE_STATES.values()}
        for df in dataframes.values():
            df = table_rename(df, _open_data_portal_column_adapter, drop=True)
            apply_func = lambda x: _IBGE_STATES.get(safe_int_cast(x))
            df["subregion1_code"] = df["_state_code"].apply(apply_func)
            for code, group in df.groupby("subregion1_code"):
                partitions[code].append(group)

        # Process each partition in a separate worker process
        map_opts = dict(desc="Processing Partitions", total=len(partitions))
        map_iter = (concat(chunks) for chunks in partitions.values())
        return concat(process_map(_process_partition, map_iter, **map_opts))
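
Two helpers used above are easy to pin down from their call sites: `safe_int_cast` only needs to return an int or None so that `_IBGE_STATES.get()` simply misses on bad input, and `_IBGE_STATES` maps IBGE numeric state codes to two-letter state codes. A hedged sketch; the mapping excerpt is illustrative, not exhaustive:

from typing import Any, Optional

# Hedged sketch of safe_int_cast as the call site implies: parse an int when
# possible, otherwise return None.
def safe_int_cast(value: Any) -> Optional[int]:
    try:
        return int(value)
    except (TypeError, ValueError):
        return None

# Illustrative excerpt (assumption): IBGE numeric codes keyed to state codes,
# e.g. 33 for Rio de Janeiro and 35 for São Paulo.
_IBGE_STATES = {33: "RJ", 35: "SP"}
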
Example #8
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        # Use a manager to handle memory accessed across processes
        manager = Manager()

        # Get all the weather stations with data up until last month from inventory
        today = datetime.date.today()
        min_date = (today - datetime.timedelta(days=30)).strftime("%Y%m%d")
        stations = read_file(sources["inventory"]).rename(
            columns={"LAT": "lat", "LON": "lon", "ELEV(M)": "elevation"}
        )
        stations = stations[stations.END > int(min_date)]
        stations["id"] = stations["USAF"] + stations["WBAN"].apply(lambda x: f"{x:05d}")

        # Open the station data as a compressed file
        with tarfile.open(sources["gsod"], mode="r:gz") as stations_tar:

            # Build the station cache by decompressing all files in memory
            station_cache = manager.dict()
            for member in pbar(stations_tar.getmembers(), desc="Decompressing"):

                if not member.name.endswith(".csv"):
                    continue

                # Read the records from the provided station
                data = read_file(
                    stations_tar.extractfile(member),
                    file_type="csv",
                    usecols=_COLUMN_MAPPING.keys(),
                ).rename(columns=_COLUMN_MAPPING)

                # Fix data types
                data["noaa_station"] = data["noaa_station"].astype(str)
                data["rainfall"] = data["rainfall"].apply(conv_dist)
                data["snowfall"] = data["snowfall"].apply(conv_dist)
                data["dew_point"] = data["dew_point"].apply(conv_temp)
                for temp_type in ("average", "minimum", "maximum"):
                    col = f"{temp_type}_temperature"
                    data[col] = data[col].apply(conv_temp)

                # Compute the relative humidity from the dew point and average temperature
                data["relative_humidity"] = data.apply(
                    lambda x: relative_humidity(x["average_temperature"], x["dew_point"]),
                    axis=1,
                )

                station_cache[member.name.replace(".csv", "")] = data

        # Get all the POI from metadata and go through each key
        keep_columns = ["key", "latitude", "longitude"]
        metadata = read_file(sources["geography"])[keep_columns].dropna()

        # Only use keys present in the metadata table
        metadata = metadata.merge(aux["metadata"])[keep_columns]

        # Convert all coordinates to radians
        stations["lat"] = stations["lat"].apply(math.radians)
        stations["lon"] = stations["lon"].apply(math.radians)
        metadata["lat"] = metadata["latitude"].apply(math.radians)
        metadata["lon"] = metadata["longitude"].apply(math.radians)

        # Make sure the stations and the cache are sent to each function call
        map_func = partial(_process_location, station_cache, stations)

        # We don't care about the index while iterating over each metadata item
        map_iter = (record for _, record in metadata.iterrows())

        # Fan the work out across multiple worker processes; each call only needs
        # the shared station cache and the station list
        records = process_map(map_func, map_iter, total=len(metadata))

        return concat(records)
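
`_process_location` is not shown in either weather example, but converting every coordinate to radians beforehand strongly suggests a haversine nearest-station lookup. The sketch below follows that assumption; the fallback behaviour, the output columns, and the idea that cache keys match the station id (USAF + WBAN) are guesses rather than the project's actual logic:

import math
from pandas import DataFrame, Series

# Hedged sketch of _process_location: find the closest weather station to a
# location using haversine distance over the pre-converted radian coordinates,
# then return that station's cached records tagged with the location's key.
def _process_location(station_cache: dict, stations: DataFrame,
                      location: Series) -> DataFrame:
    earth_radius_km = 6373

    # Haversine distance from this location to every station, in kilometers
    dlat = stations["lat"] - location["lat"]
    dlon = stations["lon"] - location["lon"]
    sin_dlat2 = (dlat / 2).apply(math.sin)
    sin_dlon2 = (dlon / 2).apply(math.sin)
    cos_lats = math.cos(location["lat"]) * stations["lat"].apply(math.cos)
    a = sin_dlat2 ** 2 + cos_lats * sin_dlon2 ** 2
    distance = 2 * earth_radius_km * a.apply(math.sqrt).apply(math.asin)

    # Walk the stations from nearest to farthest until one has cached records
    for idx in distance.sort_values().index:
        # Assumption: cache keys match the station id (USAF + WBAN)
        data = station_cache.get(stations.loc[idx, "id"])
        if data is not None:
            data = data.copy()
            data["key"] = location["key"]
            data["distance"] = distance.loc[idx]
            return data

    # No station with usable records was found for this location
    return DataFrame()
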