def fix_urls(el: Element, base_url: str, broken_urls: List[str],
             urls_to_change: dict) -> Tuple[Element, List[str]]:
    """
    Given an HTML element, passes the ``href`` of every ``a`` element inside
    it through ``fix_url`` and stores the result back on the link, so that
    relative paths end up as fully-qualified absolute URLs.

    ``el`` is modified in place; the object returned is the same one that was
    passed in. Only descendants are visited, so ``el`` itself is never
    rewritten even if it is an ``a`` element. Links without an ``href``
    attribute are left untouched.

    :arg Element el: ``lxml.html.Element`` object, the content to change.
    :arg str base_url: The URL for the page, which serves as the absolute
        point with which to calculate the absolute paths.
    :arg list broken_urls: The list of broken URLs to add to as we find them.
    :arg dict[str, str] urls_to_change: Known broken URLs and their
        replacements.

    :rtype: tuple[Element, list]
    :returns: The Element with its ``a`` elements altered, and the list of
        broken URLs as handed back by the last ``fix_url`` call (the input
        list itself when no link with an ``href`` was found).
    """
    # Threaded through every ``fix_url`` call for the duration of this one
    # element; presumably lets ``fix_url`` skip URLs it has already
    # checked -- confirm against ``fix_url``.
    tested_urls = []  # type: List[str]
    for desc in el.iterdescendants():
        if desc.tag != "a" or "href" not in desc.attrib:
            continue
        fixed_url, tested_urls, broken_urls = fix_url(
            base_url, desc.attrib["href"], tested_urls, broken_urls,
            urls_to_change)
        desc.attrib["href"] = fixed_url
    return (el, broken_urls)
# Example #2
# 0
def inner_html(node: Element) -> str:
    """
    Serialize the direct children of ``node`` into a single markup string.

    Every child is serialized exactly once, together with its complete
    subtree, and the pieces are concatenated in document order.

    NOTE(review): text placed directly inside ``node`` before its first child
    (``node.text``) is not part of the result -- confirm callers do not need
    it.

    :arg Element node: The element whose children are serialized.

    :rtype: str
    :returns: The concatenated serializations of the children, or the empty
        string when ``node`` has no children.
    """
    # Walk direct children only: ``etree.tostring`` already emits the whole
    # subtree of each child, so visiting every descendant would output
    # nested elements a second time.
    return ''.join(
        etree.tostring(child, encoding="utf-8").decode("utf-8")
        for child in node
    )