Example #1
def download_data():
    """Download every xlsx/csv file linked from DATA_PAGE into DOCS_DAILY_FOLDER."""
    try:
        req_page = _get_response(DATA_PAGE)
    except Exception as e:
        raise Exception(f"Could not get web page content: {e}") from e

    # Collect the hrefs of all spreadsheet links on the page.
    report_doc = html.document_fromstring(req_page.content.decode("utf-8"))
    xlsx_el = report_doc.xpath('.//a[contains(@href,".xlsx")]/@href')
    csv_el = report_doc.xpath('.//a[contains(@href,".csv")]/@href')
    links = csv_el + xlsx_el

    if not links:
        raise Exception("No link to xlsx/csv file found!")

    for link in links:
        link_get = requests.get(link)
        file_name = link.split("/")[-1]

        with open(os.path.join(DOCS_DAILY_FOLDER, file_name), 'wb') as f:
            f.write(link_get.content)
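Each of these snippets relies on a `_get_response` helper imported from `utils` (Example #6 below shows the import: `from utils import get_response as _get_response`). Its body isn't shown anywhere in this listing; a minimal sketch, assuming it only wraps `requests.get` with an HTTP status check, could look like this:

import requests


def get_response(url):
    """Hypothetical sketch of utils.get_response: fetch a URL and fail loudly.

    The real helper may add headers, retries, or timeouts; this is an assumption.
    """
    res = requests.get(url, timeout=30)
    res.raise_for_status()  # turn 4xx/5xx responses into exceptions callers can catch
    return res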
Example #2
def download_pdf():
    """Download the latest epidemiological update PDF into DAILY_FOLDER."""
    try:
        req_page = _get_response(REPORT_URL)
    except Exception as e:
        raise Exception(f"Could not get web page content: {e}") from e

    # "Meest recent epidemiologische update"
    report_doc = lxml.html.document_fromstring(
        req_page.content.decode("utf-8"))
    pdfs = report_doc.xpath('.//a[@title="Meest recente update.pdf"]/@href')

    if not pdfs:
        raise Exception("No link to pdf found!")

    try:
        os.makedirs(DAILY_FOLDER)
    except FileExistsError:
        logger.info(f"{DAILY_FOLDER} already exists, no need to create folder")

    for pdf_url in pdfs:
        pdf_url_get = requests.get(pdf_url)
        pdf_name = pdf_url.split("/")[-1]

        with open(os.path.join(DAILY_FOLDER, pdf_name), 'wb') as f:
            f.write(pdf_url_get.content)
Example #3
    def extract_table(self):
        """Load the data table from the web page."""
        try:
            req_dfs = pd.read_html(self.req.content, flavor='lxml')
        except ValueError:
            # pd.read_html raises ValueError when it finds no tables;
            # retry with the alternative report URL.
            self.req = _get_response(WALES_REPORT_URL_ALT)
            req_dfs = pd.read_html(self.req.content, flavor='lxml')

        if not req_dfs:
            raise Exception("Could not find data table in webpage")

        self.df = req_dfs[0]

        self.df.columns = ["nuts_3", "new_cases", "cases"]
        self.df = self.df[1:]  # drop the header row parsed as data
        # Collapse double spaces in region names.
        self.df["nuts_3"] = self.df.nuts_3.apply(
            lambda x: x.replace("  ", " ") if isinstance(x, str) else x)
        logger.info("cases:\n%s", self.df)
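As background for the try/except fallback above: pd.read_html returns one DataFrame per <table> element it can parse and raises ValueError when it finds none, which is exactly what the except clause catches before retrying the alternative URL. A minimal, self-contained illustration:

import io

import pandas as pd

html_doc = "<table><tr><th>a</th><th>b</th></tr><tr><td>1</td><td>2</td></tr></table>"
dfs = pd.read_html(io.StringIO(html_doc), flavor='lxml')
print(dfs[0])  # the single parsed table, as a DataFrame

# No <table> elements at all -> ValueError("No tables found")
try:
    pd.read_html(io.StringIO("<p>no tables here</p>"), flavor='lxml')
except ValueError as e:
    print(e)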
Example #4
    def extract_table(self):
        """Load the data table from the web page."""
        try:
            req_dfs = pd.read_html(self.req.content, flavor='lxml')
        except ValueError:
            # pd.read_html raises ValueError when it finds no tables;
            # retry with the alternative report URL.
            self.req = _get_response(WALES_REPORT_URL_ALT)
            req_dfs = pd.read_html(self.req.content, flavor='lxml')

        if not req_dfs:
            raise Exception("Could not find data table in webpage")

        self.df = req_dfs[0]

        self.df.columns = ["authority", "previous_day_cases", "new_cases", "cases"]
        self.df = self.df[1:]  # drop the header row parsed as data
        logger.info("cases:\n%s", self.df)
Example #5
def download_pdf(a_text):
    """Download the report PDF whose link text contains ``a_text``.

    ``a_text`` can be "Recentste epidemiologische update" or
    "Wekelijks epidemiologisch bulletin". Check
    https://covid-19.sciensano.be/nl/covid-19-epidemiologische-situatie
    for updated versions of the two button texts.
    """
    try:
        req_page = _get_response(REPORT_URL)
    except Exception as e:
        raise Exception(f"Could not get web page content: {e}") from e

    report_doc = html.document_fromstring(req_page.content.decode("utf-8"))
    pdfs = report_doc.xpath(f'.//a[contains(., "{a_text}")]/@href')

    if not pdfs:
        raise Exception("No link to pdf found!")

    try:
        os.makedirs(DOCS_DAILY_FOLDER)
    except FileExistsError:
        logger.info(
            f"{DOCS_DAILY_FOLDER} already exists, no need to create folder")

    for pdf_url in pdfs:
        # Links on the page may be relative; prepend the site root if so.
        if not pdf_url.startswith("http"):
            pdf_url = "https://covid-19.sciensano.be" + pdf_url

        pdf_url_get = requests.get(pdf_url)
        pdf_name = pdf_url.split("/")[-1]

        with open(os.path.join(DOCS_DAILY_FOLDER, pdf_name), 'wb') as f:
            f.write(pdf_url_get.content)
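A hypothetical usage sketch, fetching both report variants named in the docstring (the driver code is not part of the original module):

# Hypothetical driver code, assuming the module constants are configured.
if __name__ == "__main__":
    for button_text in (
        "Recentste epidemiologische update",
        "Wekelijks epidemiologisch bulletin",
    ):
        download_pdf(button_text)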
Example #6
import logging
import os
import re

import requests

from utils import get_response as _get_response

logging.basicConfig()
logger = logging.getLogger("covid-eu-data.download.dk")

# REPORT_URL = "https://www.ssi.dk/aktuelt/sygdomsudbrud/coronavirus/covid-19-i-danmark-epidemiologisk-overvaagningsrapport"
REPORT_URL = "https://www.ssi.dk/sygdomme-beredskab-og-forskning/sygdomsovervaagning/c/covid19-overvaagning"

DAILY_FOLDER = os.path.join("documents", "daily", "dk")

if __name__ == "__main__":

    try:
        req_page = _get_response(REPORT_URL)
    except Exception as e:
        raise Exception(f"Could not get web page content: {e}") from e

    # e.g. https://files.ssi.dk/COVID19-overvaagningsrapport-22032020
    re_pdf = re.compile(r'href="(https://files\.ssi\.dk/COVID19-.*?)"')
    pdfs = list(set(re_pdf.findall(req_page.content.decode("utf-8"))))

    if not pdfs:
        raise Exception("No link to pdf found!")

    try:
        os.makedirs(DAILY_FOLDER)
    except FileExistsError:
        logger.info(f"{DAILY_FOLDER} already exists, no need to create folder")
Example #7
import os
import re

import requests

from utils import get_response as _get_response

# DATA_PAGE_URL is assumed to be defined with the other module constants.
DAILY_FOLDER = os.path.join("documents", "daily", "dk")
CACHE_DAILY_FOLDER = os.path.join("cache", "daily", "dk")
os.makedirs(CACHE_DAILY_FOLDER, exist_ok=True)


def download_url(url, save_path, chunk_size=128):
    """Stream a download to disk in chunks instead of buffering it all in memory."""
    r = requests.get(url, stream=True)
    with open(save_path, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=chunk_size):
            fd.write(chunk)


if __name__ == "__main__":

    try:
        req_page = _get_response(DATA_PAGE_URL)
    except Exception as e:
        raise Exception(f"Could not get web page content: {e}") from e

    # e.g. https://files.ssi.dk/covid19/overvagning/data/data-epidemiologiske-rapport-12112020-ql82
    re_zip = re.compile(
        r'href="(https://files\.ssi\.dk/covid19/overvagning/data/data-epidemiologiske-rapport.*?)"'
    )

    zip_paths = list(set(re_zip.findall(req_page.content.decode("utf-8"))))

    if not zip_paths:
        raise Exception("No link to data zip found!")

    re_date = re.compile(r'rapport-(.*?)-')
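The snippet cuts off after compiling re_date; a hedged sketch of how the loop might continue, assuming each archive is streamed into CACHE_DAILY_FOLDER via the download_url helper above, keyed by the date parsed from its file name:

    # Hypothetical continuation, not part of the original snippet.
    for zip_path in zip_paths:
        date_match = re_date.search(zip_path)
        date_str = date_match.group(1) if date_match else "unknown-date"
        file_name = f"{date_str}_{zip_path.split('/')[-1]}"
        download_url(zip_path, os.path.join(CACHE_DAILY_FOLDER, file_name))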