def download_data():
    try:
        req_page = _get_response(DATA_PAGE)
    except Exception as e:
        raise Exception(f"Could not get web page content: {e}")

    report_doc = html.document_fromstring(req_page.content.decode("utf-8"))
    # Collect all spreadsheet links (csv first, then xlsx) from the data page
    xlsx_el = report_doc.xpath('.//a[contains(@href,".xlsx")]/@href')
    csv_el = report_doc.xpath('.//a[contains(@href,".csv")]/@href')
    links = csv_el + xlsx_el
    if not links:
        raise Exception("No links to csv or xlsx files found!")

    for link in links:
        link_get = requests.get(link)
        file_name = link.split("/")[-1]
        with open(os.path.join(DOCS_DAILY_FOLDER, file_name), 'wb') as f:
            f.write(link_get.content)
def download_pdf():
    try:
        req_page = _get_response(REPORT_URL)
    except Exception as e:
        raise Exception(f"Could not get web page content: {e}")

    # "Meest recente epidemiologische update"
    report_doc = lxml.html.document_fromstring(
        req_page.content.decode("utf-8"))
    pdfs = report_doc.xpath('.//a[@title="Meest recente update.pdf"]/@href')
    if not pdfs:
        raise Exception("No link to pdf found!")

    try:
        os.makedirs(DAILY_FOLDER)
    except FileExistsError:
        logger.info(f"{DAILY_FOLDER} already exists, no need to create folder")

    for pdf in pdfs:
        pdf_url_get = requests.get(pdf)
        pdf_name = pdf.split("/")[-1]
        with open(os.path.join(DAILY_FOLDER, pdf_name), 'wb') as f:
            f.write(pdf_url_get.content)
def extract_table(self):
    """Load data table from web page"""
    try:
        req_dfs = pd.read_html(self.req.content, flavor='lxml')
    except ValueError:
        # pd.read_html raises ValueError when no table is found;
        # fall back to the alternative report URL
        self.req = _get_response(WALES_REPORT_URL_ALT)
        req_dfs = pd.read_html(self.req.content, flavor='lxml')
    if not req_dfs:
        raise Exception("Could not find data table in webpage")

    self.df = req_dfs[0]
    self.df.columns = ["nuts_3", "new_cases", "cases"]
    self.df = self.df[1:]
    # Normalise non-breaking spaces in the authority names
    self.df["nuts_3"] = self.df.nuts_3.apply(
        lambda x: x.replace("\xa0", " ") if isinstance(x, str) else x)
    logger.info("cases:\n%s", self.df)
def extract_table(self):
    """Load data table from web page"""
    try:
        req_dfs = pd.read_html(self.req.content, flavor='lxml')
    except ValueError:
        # pd.read_html raises ValueError when no table is found;
        # fall back to the alternative report URL
        self.req = _get_response(WALES_REPORT_URL_ALT)
        req_dfs = pd.read_html(self.req.content, flavor='lxml')
    if not req_dfs:
        raise Exception("Could not find data table in webpage")

    self.df = req_dfs[0]
    self.df.columns = ["authority", "previous_day_cases", "new_cases", "cases"]
    self.df = self.df[1:]
    logger.info("cases:\n%s", self.df)
def download_pdf(a_text):
    """
    a_text can be "Recentste epidemiologische update" or
    "Wekelijks epidemiologisch bulletin".

    Check the webpage
    https://covid-19.sciensano.be/nl/covid-19-epidemiologische-situatie
    for more recent versions of the two buttons' texts.
    """
    try:
        req_page = _get_response(REPORT_URL)
    except Exception as e:
        raise Exception(f"Could not get web page content: {e}")

    report_doc = html.document_fromstring(req_page.content.decode("utf-8"))
    # Match links whose text contains the given button label
    pdfs = report_doc.xpath(f'.//a[contains(., "{a_text}")]/@href')
    if not pdfs:
        raise Exception("No link to pdf found!")

    try:
        os.makedirs(DOCS_DAILY_FOLDER)
    except FileExistsError:
        logger.info(
            f"{DOCS_DAILY_FOLDER} already exists, no need to create folder")

    for pdf in pdfs:
        # Resolve relative links against the Sciensano host
        if not pdf.startswith("http"):
            pdf = "https://covid-19.sciensano.be" + pdf
        pdf_url_get = requests.get(pdf)
        pdf_name = pdf.split("/")[-1]
        with open(os.path.join(DOCS_DAILY_FOLDER, pdf_name), 'wb') as f:
            f.write(pdf_url_get.content)
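# Hypothetical call sites for download_pdf, using the two button labels named
# in its docstring (this __main__ guard is illustrative, not taken from the
# repository):
if __name__ == "__main__":
    download_pdf("Recentste epidemiologische update")
    download_pdf("Wekelijks epidemiologisch bulletin")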
import logging
import os
import re

import requests

from utils import get_response as _get_response

logging.basicConfig()
logger = logging.getLogger("covid-eu-data.download.dk")

# REPORT_URL = "https://www.ssi.dk/aktuelt/sygdomsudbrud/coronavirus/covid-19-i-danmark-epidemiologisk-overvaagningsrapport"
REPORT_URL = "https://www.ssi.dk/sygdomme-beredskab-og-forskning/sygdomsovervaagning/c/covid19-overvaagning"
DAILY_FOLDER = os.path.join("documents", "daily", "dk")

if __name__ == "__main__":
    try:
        req_page = _get_response(REPORT_URL)
    except Exception as e:
        raise Exception(f"Could not get web page content: {e}")

    # e.g. https://files.ssi.dk/COVID19-overvaagningsrapport-22032020
    re_pdf = re.compile(r'href="(https://files.ssi.dk/COVID19-.*?)"')
    pdfs = list(set(re_pdf.findall(req_page.content.decode("utf-8"))))
    if not pdfs:
        raise Exception("No link to pdf found!")

    try:
        os.makedirs(DAILY_FOLDER)
    except FileExistsError:
        logger.info(f"{DAILY_FOLDER} already exists, no need to create folder")
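    # The script's download loop is not shown above; a minimal sketch of how
    # it plausibly continues, following the pattern of the other scrapers in
    # this repo (the loop body below is an assumption, not verbatim code):
    for pdf in pdfs:
        pdf_url_get = requests.get(pdf)
        # SSI report links carry no extension, so the URL tail is the filename
        pdf_name = pdf.split("/")[-1]
        with open(os.path.join(DAILY_FOLDER, pdf_name), 'wb') as f:
            f.write(pdf_url_get.content)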
DAILY_FOLDER = os.path.join("documents", "daily", "dk")
CACHE_DAILY_FOLDER = os.path.join("cache", "daily", "dk")
os.makedirs(CACHE_DAILY_FOLDER, exist_ok=True)


def download_url(url, save_path, chunk_size=128):
    # Stream the response to disk in chunks so large files never sit in memory
    r = requests.get(url, stream=True)
    with open(save_path, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=chunk_size):
            fd.write(chunk)


if __name__ == "__main__":
    try:
        req_page = _get_response(DATA_PAGE_URL)
    except Exception as e:
        raise Exception(f"Could not get web page content: {e}")

    # e.g. https://files.ssi.dk/covid19/overvagning/data/data-epidemiologiske-rapport-12112020-ql82
    re_zip = re.compile(
        r'href="(https://files.ssi.dk/covid19/overvagning/data/data-epidemiologiske-rapport.*?)"'
    )
    zip_paths = list(set(re_zip.findall(req_page.content.decode("utf-8"))))
    if not zip_paths:
        raise Exception("No link to zip file found!")

    re_date = re.compile(r'rapport-(.*?)-')
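    # The rest of the script is not shown above; a sketch of how the matched
    # zip links are likely fetched with download_url into the cache folder
    # (the date-based file naming here is an assumption):
    for zip_path in zip_paths:
        # re_date pulls e.g. "12112020" out of "...rapport-12112020-ql82"
        zip_dt = re_date.findall(zip_path)[0]
        download_url(
            zip_path, os.path.join(CACHE_DAILY_FOLDER, f"{zip_dt}.zip"))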