def __parse_df_from_2021_01_18(source: Source, df: pd.DataFrame) -> Iterable[Vaccinated]:
    """Parse a daily sheet in the layout handled for 2021-01-18 and later.

    Rows are scanned until a header row whose first cell is (case-insensitively)
    "region of residence", optionally prefixed with "nhs ". Every following row
    is treated as one location until a row whose first cell is
    "Data quality notes:". Rows whose first cell is NaN are skipped.

    For each location row, the non-NaN remaining cells must be exactly three
    values, taken in order as dose 1, dose 2 and the cumulative total. Three
    `Vaccinated` records are yielded per row (DOSE_1, DOSE_2, ALL).

    If no header row is found, nothing is yielded.

    Raises:
        ValueError: if a location row does not contain exactly three non-NaN
            values after the location cell.
    """
    # NOTE(review): "Unnamed: 0" is presumably the empty leading spreadsheet
    # column as named by pandas -- confirm against the loader.
    df = df.drop("Unnamed: 0", axis=1)

    # A single shared iterator: the second loop resumes right after the header
    # row that terminates the first loop.
    rows = df.iterrows()

    for _, (title, *_rest) in rows:
        # isinstance (rather than an exact type comparison) also accepts str
        # subclasses.
        if isinstance(title, str) and re.match(
                "^(nhs )?region of residence$", title.lower()):
            break

    for _, (location, *data) in rows:
        # isinstance also recognises float subclasses such as numpy.float64,
        # so a NaN spacer cell never reaches the regex match below.
        if isinstance(location, float) and math.isnan(location):
            continue
        if location == "Data quality notes:":
            break

        dose_1, dose_2, cumulative = (d for d in data if not math.isnan(d))

        # Total rows (optionally followed by one digit) and "England" describe
        # all locations rather than a single one.
        if re.match(r"^(Total\d?|England)$", location):
            where = ALL_LOCATIONS
        else:
            where = Location(location)

        for dose, count in (
                (Dose.DOSE_1, dose_1),
                (Dose.DOSE_2, dose_2),
                (Dose.ALL, cumulative),
        ):
            yield Vaccinated(source, count, Slice(location=where, dose=dose))
def parse(source: Source, df: pd.DataFrame) -> Iterable[Vaccinated]:
    """Convert one sheet into `Vaccinated` records, choosing a parser by source.

    Dispatch is on ``source.period`` and ``source.data_date``:

    * "weekly" sources dated 2021-01-07 or 2020-12-31 return hard-coded
      records and ignore ``df``;
    * other "weekly" sources go to the weekly parser;
    * "daily" sources go to the 2021-01-18 parser when dated on or after
      2021-01-18, otherwise to the earliest-format parser.

    Raises:
        AssertionError: if ``source.period`` is neither "daily" nor "weekly".
    """
    period = source.period
    data_date = source.data_date

    if period == "weekly":
        # Data overrides. Some data formats are only used once, and not worth
        # writing parsers for. Counts are listed in the order of `breakdown`.
        overrides = (
            (date(2021, 1, 7), (438075, 13567, 654810, 6414)),
            (date(2020, 12, 31), (261561, 0, 524439, 0)),
        )
        for override_date, counts in overrides:
            if data_date == override_date:
                breakdown = (
                    (Dose.DOSE_1, UNDER_80S),
                    (Dose.DOSE_2, UNDER_80S),
                    (Dose.DOSE_1, OVER_80S),
                    (Dose.DOSE_2, OVER_80S),
                )
                return [
                    Vaccinated(source, count, Slice(dose, group, ALL_LOCATIONS))
                    for count, (dose, group) in zip(counts, breakdown)
                ]
        return __parse_df_weekly(source, df)

    if period == "daily":
        if data_date >= date(2021, 1, 18):
            return __parse_df_from_2021_01_18(source, df)
        return __parse_df_earliest(source, df)

    raise AssertionError()
def __parse_df_earliest(source: Source, df: pd.DataFrame) -> Iterable[Vaccinated]:
    """Parse a daily sheet in the layout used before 2021-01-18.

    Only rows whose first cell is one of three recognised titles are used:

    * a title containing " to " and made of exactly seven whitespace-separated
      words yields a `Dose.ALL` record;
    * "of which, 1st dose" yields a `Dose.DOSE_1` record;
    * "of which, 2nd dose" yields a `Dose.DOSE_2` record
      (both compared case-insensitively after stripping whitespace).

    The count is read from the second cell after the title. Every other row,
    including rows whose first cell is not a string, is skipped.
    """
    # NOTE(review): "Unnamed: 0" is presumably the empty leading spreadsheet
    # column as named by pandas -- confirm against the loader.
    df = df.drop("Unnamed: 0", axis=1)

    sub_total_doses = {
        "of which, 1st dose": Dose.DOSE_1,
        "of which, 2nd dose": Dose.DOSE_2,
    }

    for _, (title, *data) in df.iterrows():
        # One isinstance guard replaces three exact-type comparisons and also
        # accepts str subclasses.
        if not isinstance(title, str):
            continue

        # NOTE(review): the seven-word " to " title looks like a date range
        # such as "<day> <month> <year> to <day> <month> <year>" -- confirm.
        if " to " in title and len(title.split()) == 7:
            dose = Dose.ALL
        else:
            dose = sub_total_doses.get(title.strip().lower())
            if dose is None:
                continue

        yield Vaccinated(source, data[1], Slice(dose=dose))
def __parse_df_weekly(source: Source, df: pd.DataFrame) -> Iterable[Vaccinated]:
    """Parse a weekly sheet into one `Vaccinated` per (dose, group, location).

    The table is located inside the sheet by two marker cells: it starts at
    the cell reading "region of residence" / "nhs region of residence" and ends
    just above the cell reading "data quality notes:" (both case-insensitive).
    After dropping all-NaN rows and columns, row 0 holds dose headers, row 1
    holds group headers, column 0 holds locations and the remaining cells hold
    counts. Counts that map to the same `Slice` are summed.

    Raises:
        ValueError: if the start or end marker does not occur exactly once
            (raised by the tuple unpacking of ``np.where``).
        AssertionError: if a dose header is neither ignored nor recognised.
    """

    def is_start(cell) -> bool:
        return isinstance(cell, str) and cell.lower() in (
            "region of residence", "nhs region of residence")

    def is_end(cell) -> bool:
        return isinstance(cell, str) and cell.lower() == "data quality notes:"

    def is_nan(cell) -> bool:
        # isinstance also recognises float subclasses such as numpy.float64.
        return isinstance(cell, float) and math.isnan(cell)

    a = df.to_numpy()

    # Trim to the rows from the start marker up to (excluding) the end marker,
    # and to the columns from the start marker rightwards.
    (start_y,), (start_x,) = np.where(np.vectorize(is_start)(a))
    (end_y,), (_,) = np.where(np.vectorize(is_end)(a))
    a = a[start_y:end_y, start_x:]

    # Remove columns, then rows, that consist only of NaNs.
    is_nans = np.vectorize(is_nan)(a)
    a = a[:, ~np.all(is_nans, axis=0)]
    a = a[~np.all(is_nans, axis=1), :]

    # Fill in dose row: a NaN header cell inherits the nearest non-NaN header
    # to its left, so every column carries a dose label.
    filled_in_doses = []
    current_dose = None
    for header in a[0, 1:]:
        if not is_nan(header):
            current_dose = header
        filled_in_doses.append(current_dose)
    a[0, 1:] = filled_in_doses

    # Dose headers containing any of these substrings are skipped. Built once
    # here instead of once per cell.
    ignored_dose_labels = (
        # Ignore population estimates.
        "population estimates",
        # Ignore precalculated %
        "% who have had at least 1 dose",
        "% who have had both doses",
        # Ignore dose summaries.
        "total 1st doses",
        "total 2nd doses",
    )

    vaccinated_by_slice: DefaultDict[Slice, int] = defaultdict(int)
    for y in range(2, a.shape[0]):
        for x in range(1, a.shape[1]):
            dose_label = a[0, x]
            group_label = a[1, x]
            location_label = a[y, 0]
            count = a[y, x]

            dose_lower = dose_label.lower()
            if any(label in dose_lower for label in ignored_dose_labels):
                continue
            if isinstance(group_label, str) and "percent of all" in group_label.lower():
                # Ignore percentage reports.
                continue

            is_dose_and_group_all = "cumulative total doses to date" in dose_lower
            # NOTE(review): the trailing "5" variants are presumably footnote
            # markers fused onto the header text -- confirm against a sheet.
            if dose_label in ("1st dose", "1st dose5"):
                dose = Dose.DOSE_1
            elif dose_label in ("2nd dose", "2nd dose5"):
                dose = Dose.DOSE_2
            elif is_dose_and_group_all:
                dose = Dose.ALL
            else:
                raise AssertionError(
                    f"Unexpected dose {dose_label} in source {source}")

            # The cumulative column covers every group, so its group header is
            # not parsed.
            if is_dose_and_group_all:
                group = ALL_AGES
            else:
                group = Group.from_csv_str(group_label)

            if re.match(r"^Total\d?$", location_label):
                location = ALL_LOCATIONS
            else:
                location = Location(location_label)

            vaccinated_by_slice[Slice(dose, group, location)] += count

    for slice_, vaccinated in vaccinated_by_slice.items():
        yield Vaccinated(source, vaccinated, slice_)
def add_extrapolations(vaccinated: List[Vaccinated]) -> Iterable[Vaccinated]:
    """Yield day-by-day extrapolated dose totals, then the original records.

    Every input record must be for all locations and all ages (asserted).
    Starting from the latest ``real_date`` in the input, 364 further days are
    simulated. The daily capacity is ``int((this_week - last_week) / 7)``,
    where each week value is the sum of ``vaccinated`` over the records dated
    on the latest date and on the date one week earlier. Each simulated day:

    * first doses given 12 weeks earlier are added to the second doses due;
    * capacity goes to due second doses first, the rest to first doses;
    * both are capped by the remaining population, with capacity left over
      from first doses redirected to second doses.

    For each simulated day two records flagged ``extrapolated=True`` are
    yielded, carrying the cumulative DOSE_1 and DOSE_2 totals. Intermediate
    values are written to streamlit via ``st.write``.
    """
    import streamlit as st

    assert all(v.slice.location == ALL_LOCATIONS for v in vaccinated)
    assert all(v.slice.group == ALL_AGES for v in vaccinated)

    # Cumulative first-dose total per reporting date (last record wins).
    first_dose_totals = {}
    for v in vaccinated:
        if v.slice.dose == Dose.DOSE_1:
            first_dose_totals[v.source.real_date] = v.vaccinated
    reported_dates = sorted(first_dose_totals)

    # New first doses per date: the earliest total as-is, then differences
    # between consecutive reporting dates. Missing dates read as 0.
    earliest = reported_dates[0]
    first_dose_deltas = defaultdict(int, {earliest: first_dose_totals[earliest]})
    for earlier, later in zip(reported_dates, reported_dates[1:]):
        before = first_dose_totals[earlier]
        after = first_dose_totals[later]
        st.write(earlier, later, before, after)
        first_dose_deltas[later] = after - before
    st.write({str(day): count for day, count in first_dose_deltas.items()})

    latest = max(v.source.real_date for v in vaccinated)
    week_before = latest - timedelta(weeks=1)
    this_week = sum(v.vaccinated for v in vaccinated
                    if v.source.real_date == latest)
    last_week = sum(v.vaccinated for v in vaccinated
                    if v.source.real_date == week_before)
    weekly_rate = this_week - last_week
    st.write("last week", last_week)
    st.write("this week", this_week)
    st.write("vaccination rate", weekly_rate)

    running_first = next(
        v.vaccinated for v in vaccinated
        if v.source.real_date == latest and v.slice.dose == Dose.DOSE_1)
    running_second = next(
        v.vaccinated for v in vaccinated
        if v.source.real_date == latest and v.slice.dose == Dose.DOSE_2)
    population_size = population.total_population()

    daily_capacity = int(weekly_rate / 7)
    second_dose_delay = timedelta(weeks=12)
    second_doses_due = 0
    for offset in range(1, 365):
        today = latest + timedelta(days=offset)

        # First doses given 12 weeks ago become due for their second dose.
        second_doses_due += first_dose_deltas[today - second_dose_delay]

        # Due second doses are served first; the remainder goes to first
        # doses, limited by the people who have not had one yet.
        second_today = min(max(0, second_doses_due), daily_capacity)
        first_today = min(daily_capacity - second_today,
                          population_size - running_first)

        # Capacity that first doses could not use is given to second doses.
        if first_today + second_today < daily_capacity:
            second_today += daily_capacity - (first_today + second_today)
        second_today = min(second_today, population_size - running_second)

        assert first_today >= 0
        assert second_today >= 0

        running_second += second_today
        running_first += first_today
        second_doses_due -= second_today
        first_dose_deltas[today] = first_today - second_today

        for dose, running_total in ((Dose.DOSE_1, running_first),
                                    (Dose.DOSE_2, running_second)):
            yield Vaccinated(
                source=Source("", today, today, "weekly"),
                slice=Slice(dose=dose),
                vaccinated=running_total,
                extrapolated=True,
            )

    yield from vaccinated
def deaggregate_with_interpolation(
        aggregate: Vaccinated, dim: str,
        vaccinated: List[Vaccinated]) -> Iterable[Vaccinated]:
    """Split an aggregate record along ``dim`` using weekly breakdowns.

    Weekly records are used as samples when their ``dim`` value is not "all"
    and every other slice dimension equals the aggregate's. The two sample
    dates closest to ``aggregate.source.real_date`` are selected; for each
    ``dim`` value seen on those dates, its share of the date's total is
    linearly interpolated between the two dates (clamped to the range between
    them) and applied to ``aggregate.vaccinated``.

    Yields one record per ``dim`` value, flagged ``interpolated=True`` and
    carrying the aggregate's source. Yields nothing (after printing a message)
    when fewer than two distinct sample dates exist or when both selected
    dates have a zero total.

    Args:
        aggregate: the record to split; its slice supplies the other dimensions.
        dim: name of the slice attribute to split along.
        vaccinated: candidate records to draw weekly samples from.
    """
    other_dims = [d for d in __SLICE_DIMS if d != dim]
    vaccinated_weekly = [
        v for v in vaccinated
        if v.source.period == "weekly"
        and not getattr(v.slice, dim).is_all()
        and all(
            getattr(v.slice, other_dim) == getattr(aggregate.slice, other_dim)
            for other_dim in other_dims)
    ]

    target_date = aggregate.source.real_date
    sample_dates = {v.source.real_date for v in vaccinated_weekly}

    # Two *distinct dates* are required, not merely two samples: several
    # samples sharing one date previously passed the check and then failed
    # with IndexError when the second date was read.
    if len(sample_dates) < 2:
        print(f"Failed to interpolate "
              f"{aggregate.slice} {aggregate.source.real_date} "
              f"with {len(vaccinated_weekly)} samples "
              f"across {len(sample_dates)} dates")
        return

    # Two nearest dates; the date itself breaks distance ties so the choice
    # does not depend on set iteration order.
    nearest = sorted(
        sample_dates,
        key=lambda d: (abs((d - target_date).days), d),
    )[:2]
    date0, date1 = sorted(nearest)

    # Single pass: per-date count for each dim value (insertion-ordered).
    counts = {date0: {}, date1: {}}
    for v in vaccinated_weekly:
        per_value = counts.get(v.source.real_date)
        if per_value is not None:
            dim_value = getattr(v.slice, dim)
            per_value[dim_value] = per_value.get(dim_value, 0) + v.vaccinated
    totals = {d: sum(per_value.values()) for d, per_value in counts.items()}

    if not totals[date0] and not totals[date1]:
        # No vaccinations on either date: there is no split to interpolate.
        print(f"Failed to interpolate "
              f"{aggregate.slice} {aggregate.source.real_date} "
              f"with no vaccinations on {date0} or {date1}")
        return

    # Progress is measured with real_date, the same date used to pick the
    # samples (the sample dates are real_dates as well).
    date_progress = (target_date - date0).days / (date1 - date0).days
    date_progress = max(0.0, min(1.0, date_progress))

    dim_values = list(dict.fromkeys([*counts[date0], *counts[date1]]))
    for dim_value in dim_values:
        ratio0, ratio1 = (
            counts[d].get(dim_value, 0) / totals[d] if totals[d] else None
            for d in (date0, date1)
        )
        # A date with a zero total carries no information about the split;
        # fall back to the share observed on the other date.
        if ratio0 is None:
            ratio0 = ratio1
        if ratio1 is None:
            ratio1 = ratio0

        ratio = ratio0 + (ratio1 - ratio0) * date_progress
        new_vaccinated = int(aggregate.vaccinated * ratio)
        assert new_vaccinated >= 0, (
            dim,
            dim_value,
            ratio,
            ratio0,
            ratio1,
            [date0, date1],
            aggregate.source.real_date,
        )
        yield Vaccinated(
            source=aggregate.source,
            vaccinated=new_vaccinated,
            slice=replace(aggregate.slice, **{dim: dim_value}),
            interpolated=True,
        )