def _inline_css(style_tag: PageElement, style_file: Path) -> bool:
    """Replacement callable for ``inline_data`` that inlines a stylesheet.

    Replaces ``style_tag`` in its document with a new
    ``<style type="text/css">`` element whose content is the text of
    ``style_file``.

    Args:
        style_tag: The element to be replaced.
        style_file: Path of the stylesheet whose text is inlined.

    Returns:
        ``True`` once the tag has been replaced. Errors raised while
        reading ``style_file`` propagate to the caller.
    """
    # NOTE(review): read_text() uses the platform default encoding;
    # confirm whether the stylesheets are guaranteed to match it.
    style_content = NavigableString(style_file.read_text())
    new_style_tag = BeautifulSoup(features="html.parser").new_tag("style")
    new_style_tag.insert(0, style_content)
    new_style_tag["type"] = "text/css"
    # ``replace_with`` is the current spelling of the deprecated
    # ``replaceWith`` alias.
    style_tag.replace_with(new_style_tag)
    # The signature promises a bool; previously the function implicitly
    # returned None.
    return True
def _inline_script(script_tag: PageElement, script_file: Path) -> bool:
    """Replacement callable for ``inline_data`` that inlines a script.

    Replaces ``script_tag`` in its document with a new
    ``<script type="text/javascript">`` element whose content is the text
    of ``script_file``.

    Args:
        script_tag: The element to be replaced.
        script_file: Path of the script whose text is inlined.

    Returns:
        ``True`` once the tag has been replaced. Errors raised while
        reading ``script_file`` propagate to the caller.
    """
    # NOTE(review): read_text() uses the platform default encoding;
    # confirm whether the scripts are guaranteed to match it.
    script_content = NavigableString(script_file.read_text())
    new_script_tag = BeautifulSoup(features="html.parser").new_tag("script")
    new_script_tag.insert(0, script_content)
    new_script_tag["type"] = "text/javascript"
    # ``replace_with`` is the current spelling of the deprecated
    # ``replaceWith`` alias.
    script_tag.replace_with(new_script_tag)
    # The signature promises a bool; previously the function implicitly
    # returned None.
    return True
def visit_and_hyphenate(
        node: bs4.PageElement) -> Optional[List[bs4.PageElement]]:
    """Visits an HTML node and hyphenates it if it is a text node.

    Args:
        node: The element to visit.

    Returns:
        For tag elements, the list of children that should be further
        processed. ``None`` for everything else: comments, stylesheets,
        ``<pre>`` and ``<style>`` tags (which are skipped), and text nodes
        (which are either ignored or replaced in place by their hyphenated
        version).
    """
    if isinstance(node, bs4.Comment):
        return None

    # We check whether `Stylesheet` is implemented, because it's a
    # relatively recent addition to BeautifulSoup
    # (https://bazaar.launchpad.net/~leonardr/beautifulsoup/bs4/revision/564).
    # In case it is not, we don't skip <style> nodes. This will mangle
    # stylesheets if they exist, but that is a cost I'm willing to take.
    if (is_stylesheet_implemented()
            and isinstance(node, bs4.element.Stylesheet)):
        return None

    if isinstance(node, bs4.Tag):
        # NOTE(review): <script> contents are not skipped here; confirm
        # whether should_ignore() covers them.
        if node.name in ('pre', 'style'):
            return None
        # Return a real list, as the signature promises; this also
        # snapshots the children before any of them is replaced.
        return list(node.children)

    if not isinstance(node, bs4.NavigableString):
        return None

    # My intention is to remove silent-hyphens, so that language detection
    # works correctly.
    printable_text = only_printable(node)
    if should_ignore(printable_text):
        return None

    try:
        lang = langdetect.detect(printable_text)
        if lang == 'en':
            # Use US dictionary for English, because it seems that the US
            # dictionary is richer. For example en_GB doesn't hyphenate
            # "format," but US does ("for-mat").
            lang = 'en_US'
        dic = pyphen.Pyphen(lang=lang)
    except (langdetect.lang_detect_exception.LangDetectException, KeyError):
        # Language could not be detected, or the lookup for the detected
        # language raised KeyError: leave the text untouched.
        return None

    new_text = hyphenate_end_node(dic, node)
    # ``replace_with`` is the current spelling of the deprecated
    # ``replaceWith`` alias.
    node.replace_with(new_text)
    return None