def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame: file_map = sources["US_CA-mortality-stratified"] map_func = partial(_process_html_file, file_map) map_opts = dict(desc="Processing Cache Files", total=len(file_map)) records = sum(process_map(map_func, file_map.keys(), **map_opts), []) assert len(records) > 0, "No records were found" return DataFrame.from_records(records)
def test_dry_run_pipeline(self):
    """
    This test loads the real configuration for all sources in a pipeline and runs them
    against a subset of the metadata to verify key matching. The subset of the metadata
    is chosen by running the provided `test.metadata_query` against the metadata
    auxiliary table or, if no query is present, by selecting a random sample instead.
    """
    list(process_map(_test_data_pipeline, list(get_pipeline_names()), max_workers=2))

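# NOTE: `_test_data_pipeline` is defined elsewhere. A minimal sketch consistent with the
# docstring above might look like this; `DataPipeline.load`, the `test.metadata_query`
# config field, and the `run` signature are assumptions based on the description rather
# than verified APIs.
def _test_data_pipeline(pipeline_name: str) -> None:
    pipeline = DataPipeline.load(pipeline_name)
    metadata = pipeline.auxiliary_tables["metadata"]
    # Prefer the pipeline's own metadata query; otherwise fall back to a random sample
    query = pipeline.config.get("test", {}).get("metadata_query")
    subset = metadata.query(query) if query else metadata.sample(frac=0.01)
    pipeline.run(aux={"metadata": subset})
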
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    # Get all the weather stations with data up until last month from inventory
    year = int(parse_opts.get("year")) if "year" in parse_opts else None
    cur_date = datetime.date(year, 12, 31) if year else datetime.date.today()
    min_date = (cur_date - datetime.timedelta(days=30)).strftime("%Y%m%d")
    stations = read_file(sources["inventory"]).rename(
        columns={"LAT": "lat", "LON": "lon", "ELEV(M)": "elevation"}
    )
    stations = stations[stations.END > int(min_date)]
    stations["id"] = stations["USAF"] + stations["WBAN"].apply(lambda x: f"{x:05d}")

    # Open the station data as a compressed file
    station_cache = dict()
    with tarfile.open(sources["gsod"], mode="r:gz") as stations_tar:

        # Build the station cache by decompressing all files in memory
        map_iter = stations_tar.getmembers()
        map_func = partial(_extract_station, stations_tar)
        map_opts = dict(desc="Decompressing", total=len(map_iter))
        for station_item in pbar(map(map_func, map_iter), **map_opts):
            station_cache.update(station_item)

    # Get all the POI from metadata and go through each key
    keep_columns = ["key", "latitude", "longitude"]
    metadata = read_file(sources["geography"])[keep_columns].dropna()

    # Only use keys present in the metadata table
    metadata = metadata.merge(aux["metadata"])[keep_columns]

    # Convert all coordinates to radians
    stations["lat"] = stations["lat"].apply(math.radians)
    stations["lon"] = stations["lon"].apply(math.radians)
    metadata["lat"] = metadata["latitude"].apply(math.radians)
    metadata["lon"] = metadata["longitude"].apply(math.radians)

    # Use a manager to handle memory accessed across processes
    manager = Manager()
    station_cache = manager.dict(station_cache)

    # Make sure the stations and the cache are sent to each function call
    map_func = partial(_process_location, station_cache, stations)

    # We don't care about the index while iterating over each metadata item
    map_iter = (record for _, record in metadata.iterrows())

    # The station data is already in memory at this point, so distribute the CPU-bound
    # nearest-station matching across processes
    records = process_map(map_func, map_iter, total=len(metadata))
    return concat(records)

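# NOTE: `_extract_station` is defined elsewhere in this module. Its behavior can be
# inferred from the inline variant of this parser further below: it decompresses a
# single tar member into a {station_id: DataFrame} mapping, skipping non-CSV members.
# This sketch omits the unit conversions and relative humidity computation shown in
# the inline variant.
def _extract_station(tar: tarfile.TarFile, member: tarfile.TarInfo) -> Dict[str, DataFrame]:
    if not member.name.endswith(".csv"):
        return {}
    data = read_file(
        tar.extractfile(member), file_type="csv", usecols=_COLUMN_MAPPING.keys()
    ).rename(columns=_COLUMN_MAPPING)
    return {member.name.replace(".csv", ""): data}
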
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame: file_map = sources["US_CDC_states_vaccinations"] map_func = partial(_process_cache_file, file_map) map_opts = dict(desc="Processing Cache Files", total=len(file_map)) data = concat(process_map(map_func, file_map.keys(), **map_opts)) assert len(data) > 0, "No records were found" return data
def parse_dataframes(
    self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Partition dataframes based on the state the data is for
    df = table_rename(dataframes[0], _column_adapter)
    partitions = (df[df["subregion1_code"] == code] for code in _IBGE_STATES.values())

    # Process each partition in parallel
    map_opts = dict(desc="Processing Partitions", total=len(_IBGE_STATES))
    return concat(process_map(_process_partition, partitions, **map_opts))

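# NOTE: `_process_partition` is defined elsewhere in this module and is shared with the
# open data portal variant below. From its usage (one call per state partition, outputs
# concatenated), a plausible sketch -- assuming each partition is a line list with one
# row per reported case -- is:
def _process_partition(cases: DataFrame) -> DataFrame:
    # Hypothetical: collapse the line list into daily counts for each subregion
    counts = cases.groupby(["date", "subregion1_code"]).size().rename("new_confirmed")
    counts = counts.reset_index()
    counts["key"] = "BR_" + counts["subregion1_code"]
    return counts
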
def parse(self, sources: Dict[str, Dict[str, str]], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    file_map = sources["AR_vaccinations"]
    map_func = partial(_process_cache_file, file_map)
    map_opts = dict(desc="Processing Cache Files", total=len(file_map))
    data = concat(process_map(map_func, file_map.keys(), **map_opts))
    assert len(data) > 0, "No records were found"

    # Estimate total doses administered as the sum of first and second doses
    data["total_vaccine_doses_administered"] = (
        data["total_persons_vaccinated"] + data["total_persons_fully_vaccinated"]
    )

    data["key"] = "AR_" + data["subregion1_code"]
    return data

def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Partition dataframes based on the state the data is for
    partitions = {code: [] for code in _IBGE_STATES.values()}
    for df in dataframes.values():
        df = table_rename(df, _open_data_portal_column_adapter, drop=True)
        apply_func = lambda x: _IBGE_STATES.get(safe_int_cast(x))
        df["subregion1_code"] = df["_state_code"].apply(apply_func)
        for code, group in df.groupby("subregion1_code"):
            partitions[code].append(group)

    # Process each partition in parallel
    map_opts = dict(desc="Processing Partitions", total=len(partitions))
    map_iter = (concat(chunks) for chunks in partitions.values())
    return concat(process_map(_process_partition, map_iter, **map_opts))

def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    # Use a manager to handle memory accessed across processes
    manager = Manager()

    # Get all the weather stations with data up until last month from inventory
    today = datetime.date.today()
    min_date = (today - datetime.timedelta(days=30)).strftime("%Y%m%d")
    stations = read_file(sources["inventory"]).rename(
        columns={"LAT": "lat", "LON": "lon", "ELEV(M)": "elevation"}
    )
    stations = stations[stations.END > int(min_date)]
    stations["id"] = stations["USAF"] + stations["WBAN"].apply(lambda x: f"{x:05d}")

    # Open the station data as a compressed file
    with tarfile.open(sources["gsod"], mode="r:gz") as stations_tar:

        # Build the station cache by decompressing all files in memory
        station_cache = manager.dict()
        for member in pbar(stations_tar.getmembers(), desc="Decompressing"):
            if not member.name.endswith(".csv"):
                continue

            # Read the records from the provided station
            data = read_file(
                stations_tar.extractfile(member),
                file_type="csv",
                usecols=_COLUMN_MAPPING.keys(),
            ).rename(columns=_COLUMN_MAPPING)

            # Fix data types
            data["noaa_station"] = data["noaa_station"].astype(str)
            data["rainfall"] = data["rainfall"].apply(conv_dist)
            data["snowfall"] = data["snowfall"].apply(conv_dist)
            data["dew_point"] = data["dew_point"].apply(conv_temp)
            for temp_type in ("average", "minimum", "maximum"):
                col = f"{temp_type}_temperature"
                data[col] = data[col].apply(conv_temp)

            # Compute the relative humidity from the dew point and average temperature
            data["relative_humidity"] = data.apply(
                lambda x: relative_humidity(x["average_temperature"], x["dew_point"]), axis=1
            )

            station_cache[member.name.replace(".csv", "")] = data

    # Get all the POI from metadata and go through each key
    keep_columns = ["key", "latitude", "longitude"]
    metadata = read_file(sources["geography"])[keep_columns].dropna()

    # Only use keys present in the metadata table
    metadata = metadata.merge(aux["metadata"])[keep_columns]

    # Convert all coordinates to radians
    stations["lat"] = stations["lat"].apply(math.radians)
    stations["lon"] = stations["lon"].apply(math.radians)
    metadata["lat"] = metadata["latitude"].apply(math.radians)
    metadata["lon"] = metadata["longitude"].apply(math.radians)

    # Make sure the stations and the cache are sent to each function call
    map_func = partial(_process_location, station_cache, stations)

    # We don't care about the index while iterating over each metadata item
    map_iter = (record for _, record in metadata.iterrows())

    # The station data is already in memory at this point, so distribute the CPU-bound
    # nearest-station matching across processes
    records = process_map(map_func, map_iter, total=len(metadata))
    return concat(records)

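# NOTE: `relative_humidity`, `conv_temp`, and `conv_dist` are defined elsewhere. As an
# illustration, relative humidity can be approximated from the air temperature and the
# dew point (both in degrees Celsius) using the Magnus formula; the constants below are
# one common parametrization, not necessarily the one used by this module.
from typing import Optional  # used only by this sketch

def relative_humidity(temp: float, dew_point: float) -> Optional[float]:
    if math.isnan(temp) or math.isnan(dew_point):
        return None

    def gamma(t: float) -> float:
        # Saturation vapor pressure term from the Magnus approximation
        return math.exp((17.625 * t) / (243.04 + t))

    return 100.0 * gamma(dew_point) / gamma(temp)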