Example #1
def _download_open_data(
    logger: ErrorLogger,
    url_tpl: str,
    output_folder: Path,
    ibge_code: str,
    max_volumes: int = 12,
    **download_opts,
) -> Dict[str, str]:
    logger.log_debug(f"Downloading Brazil data for {ibge_code}...")

    # Since we are guessing the URL, we forgive errors in the download
    output = {}
    download_opts = dict(download_opts, ignore_failure=True)
    map_func = partial(download_snapshot, output_folder=output_folder, **download_opts)
    map_iter = [url_tpl.format(f"{ibge_code}-{idx + 1}") for idx in range(max_volumes)]
    for idx, file_path in enumerate(thread_map(map_func, map_iter)):
        if file_path is not None:
            output[f"{ibge_code}-{idx + 1}"] = file_path

    # Filter out empty files, which can happen if download fails in an unexpected way
    output = {name: path for name, path in output.items() if Path(path).stat().st_size > 0}

    # If the output is not split into volumes, fall back to single file URL
    if output:
        return output
    else:
        url = url_tpl.format(ibge_code)
        return {ibge_code: download_snapshot(url, output_folder, **download_opts)}
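
For context, a minimal sketch of how this helper might be invoked, assuming the function above and the project's ErrorLogger are importable; the URL template and IBGE code are hypothetical placeholders, not values from the source repository:

from pathlib import Path

logger = ErrorLogger()  # assumed to be the project's logger class
files = _download_open_data(
    logger,
    "https://example.com/open-data/{}.csv",  # placeholder template URL
    Path("snapshots"),
    ibge_code="3550308",  # placeholder IBGE code
)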
Example #2
def fetch(
    self,
    output_folder: Path,
    cache: Dict[str, str],
    fetch_opts: List[Dict[str, Any]],
    skip_existing: bool = False,
) -> Dict[str, str]:
    # The link to the spreadsheet changes daily, so we parse the HTML to find the link every
    # time and download the latest version
    buffer = BytesIO()
    src_opts = fetch_opts[0]
    download(src_opts["url"], buffer)
    page = BeautifulSoup(buffer.getvalue().decode("utf8"), "lxml")
    for link in page.find_all("a"):
        href = link.attrs.get("href", "")
        if href.endswith("xlsx"):
            # Rebuild root-relative links using the scheme and host of the page URL
            if href.startswith("/"):
                href = "https://" + src_opts["url"].split("//")[1].split("/")[0] + href
            # Return a dict to match the declared return type
            return {0: download_snapshot(href, output_folder, **src_opts.get("opts", {}))}
    raise RuntimeError("No link to XLSX file found in page")
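
The manual string splitting above rebuilds the host to resolve root-relative links; the standard library's urljoin performs the same resolution and also handles already-absolute links, as in this sketch (resolve_link is a hypothetical helper):

from urllib.parse import urljoin

# urljoin resolves root-relative hrefs against the page URL and returns
# absolute hrefs unchanged, replacing the manual host reconstruction above.
def resolve_link(page_url: str, href: str) -> str:
    return urljoin(page_url, href)

resolve_link("https://example.com/reports/index.html", "/files/data.xlsx")
# -> "https://example.com/files/data.xlsx"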
Example #3
    def fetch(self, output_folder: Path, cache: Dict[str, str],
              fetch_opts: List[Dict[str, Any]]) -> Dict[str, str]:
        # The source URL is a template which we must format for the requested state
        parse_opts = self.config["parse"]
        code = parse_opts["subregion1_code"].lower()

        # Some datasets are split into "volumes" so we try to guess the URL
        base_opts = dict(fetch_opts[0])
        url_tpl = base_opts.pop("url")
        fetch_opts = [{
            "url": url_tpl.format(f"{code}-{idx}"),
            **base_opts
        } for idx in range(1, 10)]

        # Since we are guessing the URL, we forgive errors in the download
        output = {}
        for idx, source_config in enumerate(fetch_opts):
            url = source_config["url"]
            name = source_config.get("name", idx)
            download_opts = source_config.get("opts", {})
            download_opts["progress"] = True
            try:
                self.log_debug(f"Downloading {url}...")
                output[name] = download_snapshot(url, output_folder,
                                                 **download_opts)
            except Exception:
                self.log_warning(f"Failed to download URL {url}")
                break

        # If the output is not split into volumes, fall back to single file URL
        if output:
            return output
        else:
            fetch_opts = [{"url": url_tpl.format(code), **base_opts}]
            return super().fetch(output_folder, cache, fetch_opts)
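
The volume-guessing pattern above can be illustrated in isolation; this standalone sketch (probe_volumes is a hypothetical helper, not part of the codebase) uses HEAD requests to test which numbered volumes exist before downloading anything:

import requests

def probe_volumes(url_tpl: str, code: str, max_volumes: int = 10) -> list:
    # Collect consecutive volume URLs until the first one that is missing,
    # mirroring the "stop at first failure" loop above.
    found = []
    for idx in range(1, max_volumes):
        url = url_tpl.format(f"{code}-{idx}")
        if requests.head(url, allow_redirects=True).status_code != 200:
            break
        found.append(url)
    return found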
Example #4
    def fetch(
        self,
        output_folder: Path,
        cache: Dict[str, str],
        fetch_opts: List[Dict[str, Any]],
        skip_existing: bool = False,
    ) -> Dict[str, str]:
        output = {}
        curr_idx = 1
        url_tpl = fetch_opts[0].get("url")
        download_options = dict(fetch_opts[0].get("opts", {}),
                                skip_existing=skip_existing)
        while True:
            try:
                url = url_tpl.format(idx=curr_idx)
                fname = download_snapshot(url, output_folder,
                                          **download_options)
                output.update({curr_idx: fname})
                curr_idx += 1
            except requests.HTTPError:
                break

        assert len(output) > 0, "No data downloaded"
        return output
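
The loop terminates on requests.HTTPError, which assumes download_snapshot surfaces HTTP failures as exceptions; a downloader typically does this via raise_for_status, as in this hypothetical sketch (download_file is illustrative, not the repo's download_snapshot):

from pathlib import Path
import requests

def download_file(url: str, dest: Path) -> Path:
    response = requests.get(url)
    response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx
    dest.write_bytes(response.content)
    return dest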
Example #5
def fetch(self, output_folder: Path, cache: Dict[str, str],
          fetch_opts: List[Dict[str, Any]]) -> Dict[str, str]:
    geo_url = f"{URL_OUTPUTS_PROD}/geography.csv"
    download_opts = (fetch_opts or [{}])[0].get("opts", {})
    return {0: download_snapshot(geo_url, output_folder, **download_opts)}
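
A hypothetical invocation of this method; GeographyDataSource is a placeholder name for the class that defines it, and the "progress" option is the same flag seen in Example #3:

from pathlib import Path

source = GeographyDataSource()  # hypothetical subclass name
paths = source.fetch(Path("snapshots"), cache={},
                     fetch_opts=[{"opts": {"progress": True}}])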