def process_location(df: pd.DataFrame, monotonic_check: bool = True) -> pd.DataFrame:
    """Clean and normalise the vaccination data frame of one location.

    The steps, in order: parse ``date`` (day-first), drop rows dated today or
    later, make sure the people-vaccinated columns exist, cast the count
    columns to nullable integers, select and order the output columns, sort by
    date, run ``country_df_sanity_checks``, strip surrounding whitespace from
    string cells, and finally format ``date`` as ``YYYY-MM-DD`` strings.

    Args:
        df: Input frame. Must contain the columns ``location``, ``date``,
            ``vaccine``, ``source_url`` and ``total_vaccinations``;
            ``people_vaccinated`` and ``people_fully_vaccinated`` are optional
            and default to missing values.
        monotonic_check: Forwarded to ``country_df_sanity_checks``.

    Returns:
        A new data frame with the columns listed in ``usecols`` below, sorted
        by date. The input frame is not modified.
    """

    def _strip_text(value):
        # Only strings are stripped; numbers, timestamps and NA pass through.
        return value.strip() if isinstance(value, str) else value

    # Only report up to previous day to avoid partial reporting
    df = df.assign(date=pd.to_datetime(df.date, dayfirst=True))
    df = df[df.date.dt.date < datetime.now().date()]

    # Default columns for second doses. `assign` is used instead of in-place
    # column writes because `df` is the result of a boolean filter at this
    # point, and writing into it can trigger SettingWithCopyWarning.
    for column in ("people_vaccinated", "people_fully_vaccinated"):
        if column not in df:
            df = df.assign(**{column: pd.NA})
        df = df.assign(**{column: df[column].astype("Int64")})

    # Avoid decimals: go through float first, then to nullable integers.
    cols = [
        "total_vaccinations",
        "people_vaccinated",
        "people_fully_vaccinated",
    ]
    df = df.assign(
        **{column: df[column].astype(float).astype("Int64") for column in cols}
    )

    # Order columns and rows
    usecols = [
        "location",
        "date",
        "vaccine",
        "source_url",
        "total_vaccinations",
        "people_vaccinated",
        "people_fully_vaccinated",
    ]
    df = df[usecols].sort_values(by="date")

    # Sanity checks
    country_df_sanity_checks(df, monotonic_check=monotonic_check)

    # Strip. `DataFrame.applymap` is deprecated since pandas 2.1 in favour of
    # `DataFrame.map`; use whichever this pandas version provides.
    elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
    df = elementwise(_strip_text)

    # Date format
    return df.assign(date=df.date.dt.strftime("%Y-%m-%d"))
def read_csv_and_check(filepath):
    """Read a location CSV file and validate it.

    Args:
        filepath: Path (or anything ``pandas.read_csv`` accepts) of the CSV
            file. The file must contain ``location`` and ``date`` columns.

    Returns:
        The loaded data frame, with ``date`` parsed to datetimes.

    Raises:
        ValueError: If the file cannot be read, if ``date`` does not follow
            the ``%Y-%m-%d`` format, or if the dates are not monotonically
            increasing. The underlying exception, when there is one, is
            attached as ``__cause__``.
    """
    # Read
    try:
        df = pd.read_csv(filepath)
    except Exception as err:
        raise ValueError(f"Check the spreadsheet corresponding to {filepath}") from err

    # Used only to identify the offending file in error messages.
    location = df.loc[:, "location"].unique()

    # Date check
    try:
        df = df.assign(date=pd.to_datetime(df["date"], format="%Y-%m-%d"))
    except Exception as err:
        raise ValueError(
            f"{location} -- Invalid date format! Should be %Y-%m-%d. Check {df.date}."
        ) from err

    # `Series.is_monotonic` was removed in pandas 2.0;
    # `is_monotonic_increasing` is the equivalent supported property.
    if not df.date.is_monotonic_increasing:
        raise ValueError(f"{location} -- Check that date field is monotonically increasing.")

    # Checks
    country_df_sanity_checks(df)
    return df