示例#1
0
    def iterate_folders(self, html_doc: str, filter_category: str = "") -> Generator[str, None, None]:
        """Yield folder names found in the anchor tags of an HTML listing page.

        Only relative, single-segment links count as folders: anything with a
        scheme or host, a path separator, or a "."/".." path is skipped.
        Yielded names are restricted to those starting with ``filter_category``.

        Raises:
            ArchiveConnectionError: if the document cannot be parsed/iterated.
        """

        def extract_folder(anchor: bs4.element.Tag) -> str:
            # Absolute links (carrying a scheme or network location) are
            # navigation links, not folder entries.
            parsed: ParseResult = urlparse(anchor.get("href", default=""))
            if parsed.scheme or parsed.netloc:
                return ""
            normalized: str = posixpath.normpath(parsed.path)
            # Reject multi-segment paths and the "."/".." pseudo-entries.
            if "/" in normalized or normalized in (".", ".."):
                return ""
            return normalized

        try:
            parsed_doc: bs4.BeautifulSoup = bs4.BeautifulSoup(html_doc, "html.parser")
            for anchor in parsed_doc.find_all("a"):
                name: str = extract_folder(anchor)
                if name and name.startswith(filter_category):
                    yield name
        except Exception as e:
            url = posixpath.join(Settings.baseurl, self.archive_id.to_url())
            raise ArchiveConnectionError(
                f"Failed to retrieve the expected HTML page at {url}",
                suggested_action=[
                    "Check your network connection.",
                    f"Make sure that you can access {url} in your web browser.",
                ],
            ) from e
示例#2
0
def getUrl(url: str, timeout) -> str:
    """Fetch ``url`` via HTTP GET and return the response body as text.

    Redirects (status 301-308) are followed manually, up to 10 hops, so that
    ``altlink`` can rewrite the Location header to a preferred mirror.

    Args:
        url: The URL to fetch.
        timeout: Timeout passed to ``requests`` (seconds or (connect, read)).

    Raises:
        ArchiveConnectionError: on connection reset, connection error, or timeout.
        ArchiveDownloadError: when the final response status is not 200.
    """
    logger = getLogger("aqt.helper")
    with requests.Session() as session:
        retries = requests.adapters.Retry(
            total=Settings.max_retries_on_connection_error,
            backoff_factor=Settings.backoff_factor)
        adapter = requests.adapters.HTTPAdapter(max_retries=retries)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        try:
            # Bug fix: the initial request must go through `session`, not the
            # module-level `requests.get`, otherwise the retry adapter mounted
            # above is silently bypassed.
            r = session.get(url, allow_redirects=False, timeout=timeout)
            num_redirects = 0
            while 300 < r.status_code < 309 and num_redirects < 10:
                num_redirects += 1
                logger.debug("Asked to redirect({}) to: {}".format(
                    r.status_code, r.headers["Location"]))
                newurl = altlink(r.url, r.headers["Location"])
                logger.info("Redirected: {}".format(urlparse(newurl).hostname))
                r = session.get(newurl, stream=True, timeout=timeout)
        except (
                ConnectionResetError,
                requests.exceptions.ConnectionError,
                requests.exceptions.Timeout,
        ) as e:
            raise ArchiveConnectionError(
                f"Failure to connect to {url}: {type(e).__name__}") from e
        else:
            if r.status_code != 200:
                msg = f"Failed to retrieve file at {url}\nServer response code: {r.status_code}, reason: {r.reason}"
                raise ArchiveDownloadError(msg)
        result = r.text
    return result
示例#3
0
def downloadBinaryFile(url: str, out: str, hash_algo: str, exp: bytes,
                       timeout):
    """Download ``url`` to the file ``out``, verifying its checksum.

    Redirects (status 301-308) are handled manually so that ``altlink`` can
    pick a preferred mirror. The download is streamed in chunks, hashing as
    it goes, and the digest is compared against ``exp`` when provided.

    Args:
        url: The URL of the file to download.
        out: Destination file path.
        hash_algo: Algorithm name accepted by ``hashlib.new`` (e.g. "sha256").
        exp: Expected raw digest, or None to skip verification.
        timeout: Timeout passed to ``requests``.

    Raises:
        ArchiveConnectionError: on connection error or timeout.
        ArchiveDownloadError: when writing the response to disk fails.
        ArchiveChecksumError: when the digest does not match ``exp``.
    """
    logger = getLogger("aqt.helper")
    filename = Path(url).name
    with requests.Session() as session:
        retries = requests.adapters.Retry(
            total=Settings.max_retries_on_connection_error,
            backoff_factor=Settings.backoff_factor)
        adapter = requests.adapters.HTTPAdapter(max_retries=retries)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        try:
            r = session.get(url,
                            allow_redirects=False,
                            stream=True,
                            timeout=timeout)
            if 300 < r.status_code < 309:
                logger.debug("Asked to redirect({}) to: {}".format(
                    r.status_code, r.headers["Location"]))
                newurl = altlink(r.url, r.headers["Location"])
                logger.info("Redirected: {}".format(urlparse(newurl).hostname))
                r = session.get(newurl, stream=True, timeout=timeout)
        except requests.exceptions.ConnectionError as e:
            raise ArchiveConnectionError(f"Connection error: {e.args}") from e
        except requests.exceptions.Timeout as e:
            raise ArchiveConnectionError(
                f"Connection timeout: {e.args}") from e
        else:
            # `hasher` avoids shadowing the `hash` builtin.
            hasher = hashlib.new(hash_algo)
            try:
                with open(out, "wb") as fd:
                    # 8192 (8 KiB) chunks; the original 8196 was a typo.
                    for chunk in r.iter_content(chunk_size=8192):
                        fd.write(chunk)
                        hasher.update(chunk)
                    fd.flush()
            except Exception as e:
                # Bug fix: report the actual filename instead of "(unknown)";
                # `filename` was computed above but never used.
                raise ArchiveDownloadError(
                    f"Download of {filename} has error: {e}") from e
            if exp is not None and hasher.digest() != exp:
                raise ArchiveChecksumError(
                    f"Downloaded file {filename} is corrupted! Detect checksum error.\n"
                    f"Expect {exp.hex()}: {url}\n"
                    f"Actual {hasher.digest().hex()}: {out}")
示例#4
0
    def mock_getUrl(url, *args, **kwargs):
        # Test double for getUrl(): raises ArchiveConnectionError until it has
        # been invoked `num_tries_required` times, then returns a fake
        # checksum-file body ("<digest> <filename>").
        nonlocal num_tries
        num_tries += 1
        if num_tries < num_tries_required:
            raise ArchiveConnectionError(f"Must retry {num_tries_required - num_tries} more times before success")
        parsed = urlparse(url)
        base = f"{parsed.scheme}://{parsed.netloc}"
        # The checksum must only ever be fetched from a trusted mirror.
        assert base in Settings.trusted_mirrors
        # Check that the url was composed properly
        assert url[len(base) :] == f"/{rest_of_url}.sha256"

        # The last path segment should be the archive name plus ".sha256".
        hash_filename = str(parsed.path.split("/")[-1])
        assert hash_filename == "archive.7z.sha256"
        return f"{expected_hash} archive.7z"
示例#5
0
def getUrl(url: str, timeout, expected_hash: Optional[bytes] = None) -> str:
    """
    Gets a file from `url` via HTTP GET.

    No caller should call this function without providing an expected_hash, unless
    the caller is `get_hash`, which cannot know what the expected hash should be.

    Args:
        url: The URL to fetch.
        timeout: Timeout passed to ``requests``.
        expected_hash: Raw sha256 digest to verify the body against, or None.

    Raises:
        ArchiveConnectionError: on connection reset, connection error, or timeout.
        ArchiveDownloadError: when the final response status is not 200.
        ArchiveChecksumError: when the body's sha256 digest != expected_hash.
    """
    logger = getLogger("aqt.helper")
    with requests.sessions.Session() as session:
        retries = requests.adapters.Retry(
            total=Settings.max_retries_on_connection_error,
            backoff_factor=Settings.backoff_factor)
        adapter = requests.adapters.HTTPAdapter(max_retries=retries)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        try:
            r = session.get(url, allow_redirects=False, timeout=timeout)
            num_redirects = 0
            # Follow redirects (301-308) manually, at most 10 hops, so that
            # altlink() can rewrite the Location header to a preferred mirror.
            while 300 < r.status_code < 309 and num_redirects < 10:
                num_redirects += 1
                logger.debug("Asked to redirect({}) to: {}".format(
                    r.status_code, r.headers["Location"]))
                newurl = altlink(r.url, r.headers["Location"])
                logger.info("Redirected: {}".format(urlparse(newurl).hostname))
                r = session.get(newurl, stream=True, timeout=timeout)
        except (
                ConnectionResetError,
                requests.exceptions.ConnectionError,
                requests.exceptions.Timeout,
        ) as e:
            raise ArchiveConnectionError(
                f"Failure to connect to {url}: {type(e).__name__}") from e
        else:
            if r.status_code != 200:
                msg = f"Failed to retrieve file at {url}\nServer response code: {r.status_code}, reason: {r.reason}"
                raise ArchiveDownloadError(msg)
        result = r.text
        filename = url.split("/")[-1]
        actual_hash = hashlib.sha256(bytes(result, "utf-8")).digest()
        if expected_hash is not None and expected_hash != actual_hash:
            # Bug fix: report the actual filename instead of "(unknown)";
            # `filename` was computed above but never used.
            raise ArchiveChecksumError(
                f"Downloaded file {filename} is corrupted! Detect checksum error.\n"
                f"Expect {expected_hash.hex()}: {url}\n"
                f"Actual {actual_hash.hex()}: {filename}")
    return result
示例#6
0
 def _mock(url, **kwargs):
     # Test stub: record every URL that gets requested in the enclosing
     # `urls_requested` set, then always fail with a connection error so
     # the caller's fallback/retry logic can be exercised.
     urls_requested.add(url)
     raise ArchiveConnectionError()