Пример #1
0
def _inline_css(style_tag: PageElement, style_file: Path) -> bool:
    """Replace a stylesheet element with an inline ``<style>`` tag.

    Replacement callable for ``inline_data``: the text of ``style_file`` is
    embedded into a freshly created ``<style type="text/css">`` tag, which
    then takes the place of ``style_tag`` in its document.

    Args:
        style_tag: Element of the parsed document that gets replaced.
        style_file: Path of the stylesheet whose content is inlined.

    Returns:
        ``True`` once ``style_tag`` has been replaced.  Errors raised while
        reading ``style_file`` are not caught here and propagate to the
        caller.
    """
    # NOTE: read_text() without an explicit encoding uses the platform
    # default encoding.
    css_text = NavigableString(style_file.read_text())

    # A throw-away soup is only needed as a factory for the new tag.
    inline_tag = BeautifulSoup(features="html.parser").new_tag("style")
    inline_tag["type"] = "text/css"
    inline_tag.append(css_text)

    # replace_with is the current spelling of the deprecated replaceWith.
    style_tag.replace_with(inline_tag)

    # Honour the declared ``-> bool`` contract; previously the function
    # fell off the end and implicitly returned None.
    return True
Пример #2
0
def _inline_script(script_tag: PageElement, script_file: Path) -> bool:
    """Replace a script element with an inline ``<script>`` tag.

    Replacement callable for ``inline_data``: the text of ``script_file`` is
    embedded into a freshly created ``<script type="text/javascript">`` tag,
    which then takes the place of ``script_tag`` in its document.

    Args:
        script_tag: Element of the parsed document that gets replaced.
        script_file: Path of the script whose content is inlined.

    Returns:
        ``True`` once ``script_tag`` has been replaced.  Errors raised while
        reading ``script_file`` are not caught here and propagate to the
        caller.
    """
    # NOTE: read_text() without an explicit encoding uses the platform
    # default encoding.
    js_text = NavigableString(script_file.read_text())

    # A throw-away soup is only needed as a factory for the new tag.
    inline_tag = BeautifulSoup(features="html.parser").new_tag("script")
    inline_tag["type"] = "text/javascript"
    inline_tag.append(js_text)

    # replace_with is the current spelling of the deprecated replaceWith.
    script_tag.replace_with(inline_tag)

    # Honour the declared ``-> bool`` contract; previously the function
    # fell off the end and implicitly returned None.
    return True
Пример #3
0
def visit_and_hyphenate(
        node: bs4.PageElement) -> Optional[List[bs4.PageElement]]:
    """Hyphenate a single HTML node, or hand back its children to visit.

    Text nodes are hyphenated in place (the node is replaced in its tree);
    tag nodes are not modified themselves.

    Returns:
        ``node.children`` for a tag whose content should be processed
        further; ``None`` for everything else (comments, stylesheets,
        ``<pre>``/``<style>`` tags, ignored or undetectable text, and text
        nodes that have just been hyphenated).
    """
    # Comments are never touched.
    if isinstance(node, bs4.Comment):
        return None

    # `bs4.element.Stylesheet` only exists in newer BeautifulSoup releases
    # (https://bazaar.launchpad.net/~leonardr/beautifulsoup/bs4/revision/564),
    # so it is looked up only after confirming that it is available.
    stylesheet_known = is_stylesheet_implemented()
    if stylesheet_known and isinstance(node, bs4.element.Stylesheet):
        return None

    # Tags: descend into the children unless it is a <pre> or <style> tag.
    if isinstance(node, bs4.Tag):
        return None if node.name in ('pre', 'style') else node.children

    # Anything left that is not plain text is of no interest.
    if not isinstance(node, bs4.NavigableString):
        return None

    # Language detection runs on the printable text only, so that silent
    # hyphens already present do not confuse it.
    visible = only_printable(node)
    if should_ignore(visible):
        return None

    try:
        detected = langdetect.detect(visible)
        # English is mapped to the US dictionary because it appears to be
        # richer: en_GB does not hyphenate "format", en_US does ("for-mat").
        hyphenator = pyphen.Pyphen(
            lang='en_US' if detected == 'en' else detected)
    except (langdetect.lang_detect_exception.LangDetectException, KeyError):
        # Detection failed or the lookup raised KeyError: leave the text
        # untouched.
        return None

    hyphenated = hyphenate_end_node(hyphenator, node)
    node.replaceWith(hyphenated)
    return None