def main():
    """Scrape regional COVID-19 vaccination figures for Norway from the FHI
    chart-data API, merge them into the historical CSV, and record the update.

    Side effects: reads and rewrites OUTPUT_FILE, opens a browser session via
    load_driver to read the report date, and calls update_country_tracking.
    """
    # Load the previously exported data so new rows can be merged in.
    df_source = pd.read_csv(OUTPUT_FILE)

    # Get the report date from the data page. Catch a specific Exception
    # (the original used a bare `except:`) and chain the cause for debugging.
    driver = load_driver(DATA_URL)
    try:
        date = load_date(driver)
    except Exception as exc:
        raise Exception("Date not found!") from exc

    # Load dose 1 data.
    url = "https://www.fhi.no/api/chartdata/api/99112"
    df_dose1 = pd.DataFrame(
        requests.get(url).json(),
        columns=["region", "people_vaccinated"])

    # Load dose 2 data.
    url = "https://www.fhi.no/api/chartdata/api/99111"
    df_dose2 = pd.DataFrame(
        requests.get(url).json(),
        columns=["region", "people_fully_vaccinated"])

    # Drop the header-like "Fylke" row from each frame.
    # BUGFIX: the original filtered df_dose1 with a mask built from
    # df_dose2["region"]; each frame is now filtered by its OWN column.
    df_dose1 = df_dose1.loc[df_dose1["region"] != "Fylke"]
    df_dose2 = df_dose2.loc[df_dose2["region"] != "Fylke"]

    # Merge the two dose tables on region.
    df = df_dose1.merge(df_dose2, on="region", how="left")

    # Harmonize region names with the project-wide naming.
    df.loc[:, "region"] = df.loc[:, "region"].replace(REGION_RENAMING)

    # Add metadata and derived columns.
    df.loc[:, "date"] = date
    df.loc[:, "location"] = COUNTRY
    df.loc[:, "total_vaccinations"] = (
        df["people_fully_vaccinated"] + df["people_vaccinated"])

    # Add ISO codes.
    df = ISODB().merge(df, country_iso=COUNTRY_ISO)

    # Drop any history rows for this date (they are being replaced), append.
    df_source = df_source.loc[~(df_source["date"] == date)]
    df = pd.concat([df, df_source])

    # Export in the canonical column order.
    df = df[[
        "location", "region", "date", "location_iso", "region_iso",
        "total_vaccinations", "people_vaccinated", "people_fully_vaccinated"
    ]]
    df = keep_min_date(df)
    cols = [
        "total_vaccinations", "people_vaccinated", "people_fully_vaccinated"
    ]
    # astype("Int64") already represents missing values as pd.NA, so the
    # original trailing .fillna(pd.NA) was a no-op and has been removed.
    df[cols] = df[cols].astype("Int64")
    df = df.sort_values(by=["region", "date"])
    df.to_csv(OUTPUT_FILE, index=False)

    # Tracking
    update_country_tracking(
        country=COUNTRY,
        url=DATA_URL_REFERENCE,
        last_update=df["date"].max())
def main():
    """Scrape regional COVID-19 vaccination figures for Belgium from the
    covid-vaccinatie.be page, merge them into the historical CSV, and record
    the update.

    Side effects: reads and rewrites OUTPUT_FILE, performs HTTP requests, and
    calls update_country_tracking.
    """
    # Load the previously exported data.
    df_source = pd.read_csv(OUTPUT_FILE)

    # Fetch and parse the page (the site expects a custom User-Agent).
    page_content = requests.get(
        DATA_URL, headers={'User-Agent': 'Custom'}).content
    soup = BeautifulSoup(page_content, "html.parser")

    # Extract per-region dose counts. find_all is the current BeautifulSoup
    # spelling of the deprecated findAll.
    boxes = soup.find_all(class_="col-12 col-md-6 col-xl-4")
    new_data = []
    if len(boxes) == 3:
        for box in boxes:
            fields = box.find_all(class_="col-12")
            if len(fields) == 4:
                region = fields[0].text.strip()
                if "Vaccines administered" in fields[1].text:
                    total, regional = fields[1].find_all(
                        class_="col-auto text-end")
                    dose_1, dose_2 = [
                        int(x.replace(",", ""))
                        for x in regional.text.strip().split("\n")
                    ]
                    new_data.append([region, dose_1, dose_2])
    df = pd.DataFrame(
        new_data,
        columns=["region", "people_vaccinated", "people_fully_vaccinated"])

    # ROBUSTNESS: the original silently re-exported the existing file when
    # the page layout changed (empty new_data). Fail loudly instead.
    if df.empty:
        raise Exception("No regional data found; page layout may have changed")

    # Derived / metadata columns.
    df.loc[:, "total_vaccinations"] = (
        df["people_vaccinated"] + df["people_fully_vaccinated"])
    df.loc[:, "location"] = COUNTRY

    # Attach per-region report dates from the published spreadsheet.
    url = "https://covid-vaccinatie.be/en/vaccines-administered.xlsx"
    df_dates = get_date(url)
    df = df.merge(df_dates, left_on="region", right_on="Region", how="left")

    # ISO codes.
    df = ISODB().merge(df, country_iso=COUNTRY_ISO)

    # Drop history rows that this scrape replaces (same region AND date),
    # then append the remaining history.
    region = df_dates.index.tolist()
    date = df_dates.date.tolist()
    df_source = df_source.loc[~(df_source["region"].isin(region)
                                & df_source["date"].isin(date))]
    df = pd.concat([df, df_source])

    # Canonical column order for export.
    df = df[[
        "location", "region", "date", "location_iso", "region_iso",
        "total_vaccinations", "people_vaccinated", "people_fully_vaccinated"
    ]]
    cols = [
        "total_vaccinations", "people_vaccinated", "people_fully_vaccinated"
    ]
    # Avoid repeating reports
    df = keep_min_date(df)
    # astype("Int64") already uses pd.NA for missing values, so the original
    # trailing .fillna(pd.NA) was a no-op and has been removed.
    df[cols] = df[cols].astype("Int64")
    df = df.sort_values(by=["region", "date"])
    df.to_csv(OUTPUT_FILE, index=False)

    # Tracking
    update_country_tracking(
        country=COUNTRY,
        url=DATA_URL_REFERENCE,
        last_update=df["date"].max())
def main():
    """Scrape regional COVID-19 vaccination figures for Denmark from the
    newest PDF report linked on the data page, merge them into the historical
    CSV, and record the update.

    Side effects: reads and rewrites OUTPUT_FILE, downloads and parses a PDF
    via tabula, and calls update_country_tracking.
    """
    # Load the previously exported data.
    df_source = pd.read_csv(OUTPUT_FILE)

    # Locate the newest PDF on the data page.
    html_page = urllib.request.urlopen(DATA_URL)
    soup = BeautifulSoup(html_page, "html.parser")
    pdf_path = soup.find('a', text="Download her").get("href")

    # Extract the tables ONCE. Force string dtype because the numbers use a
    # "." thousands separator. BUGFIX: the original called tabula.read_pdf
    # twice with identical options; the duplicate (slow) call is removed.
    kwargs = {'pandas_options': {'dtype': str, 'header': None}}
    dfs_from_pdf = tabula.read_pdf(pdf_path, pages="all", **kwargs)

    # Report date is derived from the parsed tables.
    date = get_date(dfs_from_pdf)

    df = dfs_from_pdf[1]  # Hardcoded table index

    # Sanity checks: fail loudly if the report layout changed.
    if df.shape != (11, 7):
        raise Exception("Shape of table changed!")
    if not all(region in df[0].tolist() for region in regions):
        raise Exception("Region missing!")

    # Drop the header rows and the trailing totals row.
    df = df.drop([0, 1, 2, 3, len(df) - 1])

    # Rename the columns of interest.
    df = df.rename(columns={
        0: "region",
        2: "people_vaccinated",
        4: "people_fully_vaccinated"
    })
    df = df.astype(str)

    def del_separator(x):
        """Strip the '.' thousands separator; missing ('nan') values -> 0."""
        return int(x.replace(".", "")) if x != 'nan' else 0

    # Both dose columns share the same separator handling. The original
    # duplicated this logic with a second inline lambda for dose 1 that
    # crashed on 'nan' and carried a useless .fillna(0) on an int series.
    df.loc[:, "people_vaccinated"] = (
        df["people_vaccinated"].apply(del_separator).astype(int))
    df.loc[:, "people_fully_vaccinated"] = (
        df["people_fully_vaccinated"].apply(del_separator).astype("Int64"))

    # Harmonize region names with the project-wide naming.
    df.loc[:, "region"] = df.loc[:, "region"].replace(REGION_RENAMING)

    # Derived / metadata columns.
    df.loc[:, "total_vaccinations"] = (
        df["people_vaccinated"] + df["people_fully_vaccinated"])
    df.loc[:, "location"] = COUNTRY
    df.loc[:, "date"] = date

    # Add ISO codes. "Others" presumably gets no region ISO from the merge,
    # so its location_iso is set to the country code — TODO confirm.
    df = ISODB().merge(df, country_iso=COUNTRY_ISO)
    df.loc[df["region"] == "Others", "location_iso"] = COUNTRY_ISO

    # Drop any history rows for this date (they are being replaced), append.
    df_source = df_source.loc[~(df_source["date"] == date)]
    df = pd.concat([df, df_source])

    # Export in the canonical column order.
    cols = [
        "location", "region", "date", "location_iso", "region_iso",
        "total_vaccinations", "people_vaccinated", "people_fully_vaccinated"
    ]
    df = keep_min_date(df[cols])[cols]
    df = df.sort_values(by=["region", "date"])
    df.to_csv(OUTPUT_FILE, index=False)

    # Tracking
    update_country_tracking(
        country=COUNTRY,
        url=DATA_URL_REFERENCE,
        last_update=df["date"].max())