def _download_open_data(
    logger: ErrorLogger,
    url_tpl: str,
    output_folder: Path,
    ibge_code: str,
    max_volumes: int = 12,
    **download_opts,
) -> Dict[str, str]:
    logger.log_debug(f"Downloading Brazil data for {ibge_code}...")

    # Since we are guessing the URL, we forgive errors in the download
    output = {}
    download_opts = dict(download_opts, ignore_failure=True)
    map_func = partial(download_snapshot, output_folder=output_folder, **download_opts)
    map_iter = [url_tpl.format(f"{ibge_code}-{idx + 1}") for idx in range(max_volumes)]
    for idx, file_path in enumerate(thread_map(map_func, map_iter)):
        if file_path is not None:
            output[f"{ibge_code}-{idx + 1}"] = file_path

    # Filter out empty files, which can happen if download fails in an unexpected way
    output = {name: path for name, path in output.items() if Path(path).stat().st_size > 0}

    # If the output is not split into volumes, fall back to single file URL
    if output:
        return output
    else:
        url = url_tpl.format(ibge_code)
        return {ibge_code: download_snapshot(url, output_folder, **download_opts)}
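# For readers outside this codebase, the sketch below reproduces the same
# volume-guessing pattern using only the standard library and `requests`.
# The helper names, URL template handling, and twelve-volume limit are illustrative
# assumptions, not the pipeline's `download_snapshot`/`thread_map` helpers.
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Dict, Optional

import requests


def _try_download(url: str, output_folder: Path) -> Optional[Path]:
    # Forgive failures: a missing volume simply yields None
    try:
        res = requests.get(url, timeout=30)
        res.raise_for_status()
    except requests.RequestException:
        return None
    file_path = output_folder / url.split("/")[-1]
    file_path.write_bytes(res.content)
    return file_path


def download_volumes(
    url_tpl: str, output_folder: Path, code: str, max_volumes: int = 12
) -> Dict[str, Path]:
    # Guess the per-volume URLs and fetch them concurrently
    urls = [url_tpl.format(f"{code}-{idx + 1}") for idx in range(max_volumes)]
    with ThreadPoolExecutor() as pool:
        results = list(pool.map(lambda url: _try_download(url, output_folder), urls))

    # Keep only successful, non-empty downloads
    output = {
        f"{code}-{idx + 1}": path
        for idx, path in enumerate(results)
        if path is not None and path.stat().st_size > 0
    }

    # Fall back to the single-file URL when no volumes were found
    if not output:
        single = _try_download(url_tpl.format(code), output_folder)
        if single is not None:
            output[code] = single
    return output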
def fetch(
    self,
    output_folder: Path,
    cache: Dict[str, str],
    fetch_opts: List[Dict[str, Any]],
    skip_existing: bool = False,
) -> Dict[str, str]:
    # The link to the spreadsheet changes daily, so we parse the HTML to find the link every
    # time and download the latest version
    buffer = BytesIO()
    src_opts = fetch_opts[0]
    download(src_opts["url"], buffer)
    page = BeautifulSoup(buffer.getvalue().decode("utf8"), "lxml")

    for link in page.find_all("a"):
        href = link.attrs.get("href", "")
        if href.endswith("xlsx"):
            # Resolve host-relative links against the page's host
            if href.startswith("/"):
                href = "https://" + src_opts["url"].split("//")[1].split("/")[0] + href
            return {0: download_snapshot(href, output_folder, **src_opts.get("opts", {}))}

    raise RuntimeError("No link to XLSX file found in page")
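# A note on the relative-link handling above: the scheme-and-host string surgery can be
# replaced by urllib.parse.urljoin, which resolves absolute, host-relative and
# path-relative hrefs against the page URL. A minimal, self-contained sketch of the same
# link-discovery step follows; the page URL is an illustrative placeholder, not the
# source's real address.
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

page_url = "https://example.org/daily-report"  # placeholder, not the real source
res = requests.get(page_url, timeout=30)
res.raise_for_status()
page = BeautifulSoup(res.text, "lxml")

xlsx_url = None
for link in page.find_all("a", href=True):
    if link["href"].endswith("xlsx"):
        # urljoin handles hrefs that are absolute, host-relative or path-relative
        xlsx_url = urljoin(page_url, link["href"])
        break

if xlsx_url is None:
    raise RuntimeError("No link to XLSX file found in page")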
def fetch(
    self, output_folder: Path, cache: Dict[str, str], fetch_opts: List[Dict[str, Any]]
) -> Dict[str, str]:
    # The source URL is a template which we must format for the requested state
    parse_opts = self.config["parse"]
    code = parse_opts["subregion1_code"].lower()

    # Some datasets are split into "volumes" so we try to guess the URL
    base_opts = dict(fetch_opts[0])
    url_tpl = base_opts.pop("url")
    fetch_opts = [
        {"url": url_tpl.format(f"{code}-{idx}"), **base_opts} for idx in range(1, 10)
    ]

    # Since we are guessing the URL, we forgive errors in the download
    output = {}
    for idx, source_config in enumerate(fetch_opts):
        url = source_config["url"]
        name = source_config.get("name", idx)
        download_opts = source_config.get("opts", {})
        download_opts["progress"] = True
        try:
            self.log_debug(f"Downloading {url}...")
            output[name] = download_snapshot(url, output_folder, **download_opts)
        except Exception:
            self.log_warning(f"Failed to download URL {url}")
            break

    # If the output is not split into volumes, fall back to single file URL
    if output:
        return output
    else:
        fetch_opts = [{"url": url_tpl.format(code), **base_opts}]
        return super().fetch(output_folder, cache, fetch_opts)
def fetch(
    self,
    output_folder: Path,
    cache: Dict[str, str],
    fetch_opts: List[Dict[str, Any]],
    skip_existing: bool = False,
) -> Dict[str, str]:
    output = {}
    curr_idx = 1
    # The configured URL is expected to be a template with an {idx} placeholder
    url_tpl = fetch_opts[0].get("url")
    download_options = dict(fetch_opts[0].get("opts", {}), skip_existing=skip_existing)

    # Download consecutive volumes until the server responds with an HTTP error
    while True:
        try:
            url = url_tpl.format(idx=curr_idx)
            fname = download_snapshot(url, output_folder, **download_options)
            output[curr_idx] = fname
            curr_idx += 1
        except requests.HTTPError:
            break

    assert len(output) > 0, "No data downloaded"
    return output
def fetch(
    self, output_folder: Path, cache: Dict[str, str], fetch_opts: List[Dict[str, Any]]
) -> Dict[str, str]:
    # Build the geography table URL from the production outputs base URL
    geo_url = f"{URL_OUTPUTS_PROD}/geography.csv"
    download_opts = (fetch_opts or [{}])[0].get("opts", {})
    return {0: download_snapshot(geo_url, output_folder, **download_opts)}