def iterate_folders(self, html_doc: str, filter_category: str = "") -> Generator[str, None, None]:
    def link_to_folder(link: bs4.element.Tag) -> str:
        raw_url: str = link.get("href", default="")
        url: ParseResult = urlparse(raw_url)
        if url.scheme or url.netloc:
            # Absolute URL: points off-site, not a folder in this listing.
            return ""
        url_path: str = posixpath.normpath(url.path)
        if "/" in url_path or url_path == "." or url_path == "..":
            # Multi-segment or traversal paths are not direct subfolders.
            return ""
        return url_path

    try:
        soup: bs4.BeautifulSoup = bs4.BeautifulSoup(html_doc, "html.parser")
        for link in soup.find_all("a"):
            folder: str = link_to_folder(link)
            if not folder:
                continue
            if folder.startswith(filter_category):
                yield folder
    except Exception as e:
        url = posixpath.join(Settings.baseurl, self.archive_id.to_url())
        raise ArchiveConnectionError(
            f"Failed to retrieve the expected HTML page at {url}",
            suggested_action=[
                "Check your network connection.",
                f"Make sure that you can access {url} in your web browser.",
            ],
        ) from e

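# A minimal standalone sketch of the same folder-filtering rules, for
# illustration (assumes only bs4 is installed; `folders` is a hypothetical
# helper, not part of the class above):
import posixpath
from urllib.parse import urlparse

import bs4


def folders(html_doc: str, prefix: str = ""):
    soup = bs4.BeautifulSoup(html_doc, "html.parser")
    for link in soup.find_all("a"):
        url = urlparse(str(link.get("href", default="")))
        if url.scheme or url.netloc:  # absolute URL: skip
            continue
        path = posixpath.normpath(url.path)
        if "/" in path or path in (".", ".."):  # not a direct subfolder: skip
            continue
        if path.startswith(prefix):
            yield path


assert list(folders('<a href="6.2.0/">x</a><a href="https://a.io/b">y</a>', "6.")) == ["6.2.0"]
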
def getUrl(url: str, timeout) -> str:
    logger = getLogger("aqt.helper")
    with requests.Session() as session:
        retries = requests.adapters.Retry(
            total=Settings.max_retries_on_connection_error,
            backoff_factor=Settings.backoff_factor,
        )
        adapter = requests.adapters.HTTPAdapter(max_retries=retries)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        try:
            # Use the session (not bare requests.get), so the mounted retry
            # adapter actually applies to the initial request.
            r = session.get(url, allow_redirects=False, timeout=timeout)
            num_redirects = 0
            while 300 < r.status_code < 309 and num_redirects < 10:
                num_redirects += 1
                logger.debug("Asked to redirect({}) to: {}".format(r.status_code, r.headers["Location"]))
                newurl = altlink(r.url, r.headers["Location"])
                logger.info("Redirected: {}".format(urlparse(newurl).hostname))
                r = session.get(newurl, stream=True, timeout=timeout)
        except (
            ConnectionResetError,
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
        ) as e:
            raise ArchiveConnectionError(f"Failure to connect to {url}: {type(e).__name__}") from e
        else:
            if r.status_code != 200:
                msg = f"Failed to retrieve file at {url}\nServer response code: {r.status_code}, reason: {r.reason}"
                raise ArchiveDownloadError(msg)
        result = r.text
    return result

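# Standalone sketch of the session/retry setup used above, runnable on its own
# (the retry count, backoff, and timeout values here are placeholders, not the
# real Settings values):
import requests
from requests.adapters import HTTPAdapter, Retry

session = requests.Session()
retry = Retry(total=5, backoff_factor=0.1)  # hypothetical limits
session.mount("https://", HTTPAdapter(max_retries=retry))
session.mount("http://", HTTPAdapter(max_retries=retry))
response = session.get("https://download.qt.io/", allow_redirects=False, timeout=(3.5, 30))
print(response.status_code, response.headers.get("Location"))
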
def downloadBinaryFile(url: str, out: str, hash_algo: str, exp: bytes, timeout):
    logger = getLogger("aqt.helper")
    filename = Path(url).name
    with requests.Session() as session:
        retries = requests.adapters.Retry(
            total=Settings.max_retries_on_connection_error,
            backoff_factor=Settings.backoff_factor,
        )
        adapter = requests.adapters.HTTPAdapter(max_retries=retries)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        try:
            r = session.get(url, allow_redirects=False, stream=True, timeout=timeout)
            if 300 < r.status_code < 309:
                logger.debug("Asked to redirect({}) to: {}".format(r.status_code, r.headers["Location"]))
                newurl = altlink(r.url, r.headers["Location"])
                logger.info("Redirected: {}".format(urlparse(newurl).hostname))
                r = session.get(newurl, stream=True, timeout=timeout)
        except requests.exceptions.ConnectionError as e:
            raise ArchiveConnectionError(f"Connection error: {e.args}") from e
        except requests.exceptions.Timeout as e:
            raise ArchiveConnectionError(f"Connection timeout: {e.args}") from e
        else:
            # Renamed from `hash` to avoid shadowing the builtin.
            fhash = hashlib.new(hash_algo)
            try:
                with open(out, "wb") as fd:
                    for chunk in r.iter_content(chunk_size=8192):
                        fd.write(chunk)
                        fhash.update(chunk)
                    fd.flush()
            except Exception as e:
                raise ArchiveDownloadError(f"Download of {filename} failed: {e}") from e
            if exp is not None and fhash.digest() != exp:
                raise ArchiveChecksumError(
                    f"Downloaded file {filename} is corrupted! Checksum mismatch detected.\n"
                    f"Expected {exp.hex()}: {url}\n"
                    f"Actual   {fhash.digest().hex()}: {out}"
                )

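# Minimal standalone version of the stream-and-verify pattern above
# (hypothetical helper name; no mirrors, redirects, or retries):
import hashlib
import requests


def fetch_verified(url: str, out: str, expected_sha256: bytes, timeout=(5, 30)) -> None:
    digest = hashlib.sha256()
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(out, "wb") as fd:
            for chunk in r.iter_content(chunk_size=8192):
                fd.write(chunk)  # write and hash in one pass over the stream
                digest.update(chunk)
    if digest.digest() != expected_sha256:
        raise ValueError(f"Checksum mismatch for {url}")
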
def mock_getUrl(url, *args, **kwargs):
    nonlocal num_tries
    num_tries += 1
    if num_tries < num_tries_required:
        raise ArchiveConnectionError(f"Must retry {num_tries_required - num_tries} more times before success")
    parsed = urlparse(url)
    base = f"{parsed.scheme}://{parsed.netloc}"
    assert base in Settings.trusted_mirrors
    # Check that the url was composed properly
    assert url[len(base) :] == f"/{rest_of_url}.sha256"
    hash_filename = parsed.path.split("/")[-1]
    assert hash_filename == "archive.7z.sha256"
    return f"{expected_hash} archive.7z"

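# Sketch of the closure this mock expects and the retry loop it is meant to
# exercise (all names and values here are hypothetical stand-ins for the real
# test fixture, which monkeypatches aqt.helper.getUrl instead):
def sketch_retry_until_success():
    num_tries, num_tries_required = 0, 3
    expected_hash = "ab" * 32  # hypothetical sha256 hex digest
    rest_of_url = "online/archive.7z"  # hypothetical remote path

    def flaky(url):
        nonlocal num_tries
        num_tries += 1
        if num_tries < num_tries_required:
            raise ConnectionError("try again")  # stands in for ArchiveConnectionError
        return f"{expected_hash} archive.7z"

    for _ in range(num_tries_required):
        try:
            return flaky(f"https://mirror.example/{rest_of_url}.sha256")
        except ConnectionError:
            continue


assert sketch_retry_until_success().startswith("ab" * 32)
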
def getUrl(url: str, timeout, expected_hash: Optional[bytes] = None) -> str:
    """
    Gets a file from `url` via HTTP GET.

    No caller should call this function without providing an expected_hash,
    unless the caller is `get_hash`, which cannot know what the expected hash
    should be.
    """
    logger = getLogger("aqt.helper")
    with requests.sessions.Session() as session:
        retries = requests.adapters.Retry(
            total=Settings.max_retries_on_connection_error,
            backoff_factor=Settings.backoff_factor,
        )
        adapter = requests.adapters.HTTPAdapter(max_retries=retries)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        try:
            r = session.get(url, allow_redirects=False, timeout=timeout)
            num_redirects = 0
            while 300 < r.status_code < 309 and num_redirects < 10:
                num_redirects += 1
                logger.debug("Asked to redirect({}) to: {}".format(r.status_code, r.headers["Location"]))
                newurl = altlink(r.url, r.headers["Location"])
                logger.info("Redirected: {}".format(urlparse(newurl).hostname))
                r = session.get(newurl, stream=True, timeout=timeout)
        except (
            ConnectionResetError,
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
        ) as e:
            raise ArchiveConnectionError(f"Failure to connect to {url}: {type(e).__name__}") from e
        else:
            if r.status_code != 200:
                msg = f"Failed to retrieve file at {url}\nServer response code: {r.status_code}, reason: {r.reason}"
                raise ArchiveDownloadError(msg)
            result = r.text
            filename = url.split("/")[-1]
            actual_hash = hashlib.sha256(bytes(result, "utf-8")).digest()
            if expected_hash is not None and expected_hash != actual_hash:
                raise ArchiveChecksumError(
                    f"Downloaded file {filename} is corrupted! Checksum mismatch detected.\n"
                    f"Expected {expected_hash.hex()}: {url}\n"
                    f"Actual   {actual_hash.hex()}: {filename}"
                )
    return result

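# Usage sketch of the two-step flow the docstring describes: fetch the .sha256
# companion first (no expected_hash available yet), then fetch the payload with
# the hash pinned. URLs here are hypothetical:
sha_line = getUrl("https://mirror.example/Updates.xml.sha256", timeout=(5, 30))
expected = bytes.fromhex(sha_line.split()[0])
xml_text = getUrl("https://mirror.example/Updates.xml", timeout=(5, 30), expected_hash=expected)
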
def _mock(url, **kwargs):
    urls_requested.add(url)
    raise ArchiveConnectionError()

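# Hypothetical wiring for the mock above: record every URL the code under test
# attempts, let each attempt fail, then assert on the recorded set afterwards
# (the monkeypatch target and the assertion are illustrative, not the real test):
#
#   urls_requested: set = set()
#   monkeypatch.setattr("aqt.helper.getUrl", _mock)
#   with pytest.raises(ArchiveConnectionError):
#       code_under_test()
#   assert any(url.startswith(mirror) for mirror in Settings.trusted_mirrors for url in urls_requested)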