def timeseries(self) -> TimeseriesDataset:
    """Return a ``TimeseriesDataset`` constructed from this data source.

    The keys of ``self.INDEX_FIELD_MAP`` are first compared, as a set, with
    ``TimeseriesDataset.INDEX_FIELDS``; the dataset is only built when the
    two sets are equal.

    Returns:
        The result of ``TimeseriesDataset.from_source`` called with this
        source and ``fill_missing_state=self.FILL_MISSING_STATE_LEVEL_DATA``.

    Raises:
        ValueError: If the mapped index fields differ from
            ``TimeseriesDataset.INDEX_FIELDS``.
    """
    mapped_fields = set(self.INDEX_FIELD_MAP.keys())
    required_fields = set(TimeseriesDataset.INDEX_FIELDS)

    if mapped_fields == required_fields:
        return TimeseriesDataset.from_source(
            self,
            fill_missing_state=self.FILL_MISSING_STATE_LEVEL_DATA,
        )

    raise ValueError("Index fields must match")
def test_summarize_timeseries_fields_with_some_real_data():
    """Summarize locally loaded timeseries rows and spot-check one county.

    Only rows whose FIPS starts with "06" are summarized. The checks then
    focus on the ("06025", "cases") entry of the summary.
    """
    source = CovidCountyDataDataSource.local()
    dataset = TimeseriesDataset.from_source(source)

    # Restrict the input to FIPS codes beginning with "06" before summarizing.
    frame = dataset.data
    fips_starts_with_06 = frame[CommonFields.FIPS].str.startswith("06")
    summary = summarize_timeseries_fields(frame.loc[fips_starts_with_06])
    assert not summary.empty

    row_key = ("06025", "cases")
    cases_summary = summary.loc[row_key, :]

    assert summary.loc[row_key, "max_value"] > 7000
    assert summary.loc[row_key, "max_date"] > pd.to_datetime("2020-08-01")
    assert summary.loc[row_key, "largest_delta_date"] > pd.to_datetime("2020-04-01")

    # NOTE(review): the explicit `== True` is kept on purpose; the stored
    # value is presumably a (numpy) bool, and switching to a truthiness or
    # identity check could change the outcome for other values -- confirm
    # before "fixing" the lint warning.
    assert cases_summary["has_value"] == True  # noqa: E712
    assert cases_summary["num_observations"] > 100
def test_expected_field_in_sources(data_source_cls):
    """Check that a data source converts and covers the expected US states.

    The source is loaded with ``local()`` and passed through
    ``TimeseriesDataset.from_source``. Its raw frame is then renamed to
    common field names and filtered to rows whose ``country`` is "USA".
    For the source named "NHA" the only state must be "NV"; for every other
    source at least 48 two-capital-letter state codes must be present.
    """
    data_source = data_source_cls.local()
    # The returned dataset is not inspected; the call is kept so that the
    # conversion itself still runs as part of this test.
    TimeseriesDataset.from_source(data_source)

    # Extract the USA data from the raw DF. Replace this with cleaner access
    # when the DataSource makes it easy.
    source_to_common = {}
    for common_name, source_name in data_source.all_fields_map().items():
        source_to_common[source_name] = common_name
    renamed = data_source.data.rename(columns=source_to_common)

    is_usa = renamed["country"] == "USA"
    usa_rows = renamed.loc[is_usa]
    assert not usa_rows.empty

    states = set(usa_rows["state"])

    if data_source.SOURCE_NAME == "NHA":
        assert states == {"NV"}
        return

    # Keep only values shaped like a two-capital-letter code; anything else
    # is logged and left out of the count.
    two_capitals = re.compile(r"[A-Z]{2}")
    good_state = set()
    for state in states:
        if two_capitals.fullmatch(state) is None:
            logging.info(f"Ignoring {state} in {data_source.SOURCE_NAME}")
            continue
        good_state.add(state)

    assert len(good_state) >= 48
def timeseries(self) -> "TimeseriesDataset":
    """Return the timeseries dataset built from this (generic beds) source.

    Delegates entirely to ``TimeseriesDataset.from_source`` with this
    object as the only argument.
    """
    dataset = TimeseriesDataset.from_source(self)
    return dataset