def test_strip_tags():
    """Check that strip_tags drops the listed tags and rejects a non-list argument."""
    parser = HTMLParser("<body><div></div><script></script></body>")

    parser.strip_tags(['div', 'script'])

    # Only the html/head/body skeleton should remain once both tags are removed.
    expected_markup = '<html><head></head><body></body></html>'
    assert parser.html == expected_markup

    # An integer is not an acceptable tag collection.
    with pytest.raises(TypeError):
        parser.strip_tags(1)
def make_proper_html(cls, html_string, docx_file_path):
    """Wrap converted body markup in a complete HTML document with metadata.

    The markup is cleaned first: tags listed in ``TAGS_TO_UNWRAP`` are
    unwrapped inside the body and tags listed in ``TAGS_TO_REMOVE`` are
    stripped from the whole tree. The title and author are read from the core
    properties of the document at ``docx_file_path``.

    Args:
        html_string: HTML markup to clean and embed.
        docx_file_path: Location of the source document. Its ``.stem`` is used
            as the fallback title (assumes a ``pathlib.Path``-like object --
            TODO confirm against callers).

    Returns:
        The document as a single string whose lines are joined with
        ``NEWLINE``: doctype, ``<html>``, a ``<head>`` holding charset, author
        and title, the cleaned body markup, and the closing ``</html>``.
    """
    parsed = HTMLParser(html_string)
    parsed.body.unwrap_tags(TAGS_TO_UNWRAP)
    parsed.strip_tags(TAGS_TO_REMOVE)
    body_markup = parsed.body.html

    props = DocxDocumentReader(docx_file_path).core_properties

    # Guard against a missing title the same way the author is guarded below,
    # so an unset property falls through to the file-name fallback instead of
    # raising AttributeError on ``None.strip()``.
    doc_title = (props.title or "").strip()
    if not doc_title or doc_title.lower() == "word document":
        # A blank title or the "word document" placeholder carries no
        # information; use the file name without its extension instead.
        doc_title = docx_file_path.stem.strip()

    doc_author = escape_html(props.author or "")

    return NEWLINE.join([
        "<!DOCTYPE html>",
        "<html>",
        "<head>",
        '<meta charset="utf-8"/>',
        f'<meta name="author" content="{doc_author}"/>',
        f"<title>{escape_html(doc_title)}</title>",
        "</head>",
        body_markup,
        "</html>",
    ])
def extract_html_features(html):
    """Turn an HTML document into a mapping of word -> occurrence count.

    Pipeline:
        1. Drop every ``script`` and ``style`` element.
        2. Take the text of the document root, space-separated.
        3. Lowercase it and run it through ``TRANSLATOR``.
        4. Split into lines, then into space-separated phrases, trimming each
           piece and discarding the empty ones.
        5. Count how often each whitespace-separated token occurs.

    Returns:
        A dict keyed by token (inserted in sorted order) with integer counts.
        An empty dict is returned when the document yields no text or when a
        ``UnicodeDecodeError`` is raised while parsing/extracting.
    """
    try:
        document = HTMLParser(html)
        document.strip_tags(['script', 'style'])
        raw_text = document.root.text(separator=' ')
    except UnicodeDecodeError:
        return {}

    if not raw_text:
        return {}

    normalized = raw_text.lower().translate(TRANSLATOR)

    # Trim every line, break it into phrases, and keep the non-empty pieces.
    pieces = []
    for raw_line in normalized.splitlines():
        for phrase in raw_line.strip().split(" "):
            phrase = phrase.strip()
            if phrase:
                pieces.append(phrase)
    cleaned = '\n'.join(pieces)

    # groupby only merges adjacent equal items, hence the sort beforehand.
    weights = {}
    for token, occurrences in groupby(sorted(cleaned.split())):
        weights[token] = len(list(occurrences))
    return weights
def parse_html(json_entry):
    """Extract visible text and media references from an entry's HTML file.

    Args:
        json_entry: Mapping describing the page. ``'path'`` (file to read) is
            required; ``'source'`` is optional. When ``'source'`` is missing or
            empty, ``'id'`` and ``'icon'`` are read as well.

    Returns:
        ``None`` when the parsed document has no ``<body>``. Otherwise a
        4-tuple ``(json_entry, text, media_lst, update_src_dict)`` where
        ``text`` is the body text after non-textual tags were stripped,
        ``media_lst`` is the list of media attribute values found, and
        ``update_src_dict`` is ``None`` when the entry already carried a
        source, or a dict with ``'id'``, ``'source'`` and ``'icon'`` when the
        source had to be looked up in the document.
    """
    path = json_entry['path']
    # Undecodable bytes are dropped rather than aborting the whole parse.
    with open(path, 'r', encoding='utf8', errors='ignore') as f:
        html = f.read()

    tree = HTMLParser(html)
    if tree.body is None:
        return None

    source_url = json_entry.get('source', '')
    scraped_src = source_url == ''

    # Without a known source, fall back to the page's canonical link.
    if scraped_src:
        for node in tree.css('link'):
            attrs = node.attributes
            if attrs.get('rel') != 'canonical':
                continue
            # A canonical link with a missing or empty href is unusable; keep
            # looking instead of raising KeyError or storing an empty source.
            href = attrs.get('href')
            if href:
                source_url = href
                break

    # Attributes that may hold a media path, per tag, in the order they are
    # collected.
    media_attrs = {
        'img': ('src',),
        'video': ('src', 'poster'),
        'audio': ('src',),
        'object': ('data',),
        'embed': ('src',),
        'source': ('src', 'srcset'),
    }

    # Find all multimedia file paths in every tag that could possibly contain
    # one, using a single CSS selector.
    media_lst = []
    for node in tree.css('img,video,audio,object,embed,source'):
        attrs = node.attributes
        for attr_name in media_attrs.get(node.tag, ()):
            value = attrs.get(attr_name)
            # Skip absent and empty attributes alike.
            # NOTE(review): a srcset value is stored verbatim, not split into
            # individual URLs -- confirm consumers expect that.
            if value:
                media_lst.append(value)

    # These tags will never contain visible text.
    ignore_tags = [
        'img', 'video', 'audio', 'object', 'embed', 'source', 'script',
        'style', 'head', 'meta', '[document]'
    ]
    tree.strip_tags(ignore_tags)
    text = tree.body.text()

    if not scraped_src:
        return json_entry, text, media_lst, None

    update_src_dict = {
        'id': json_entry['id'],
        'source': source_url,
        'icon': json_entry['icon'],
    }
    # find_icon receives the dict with the original icon still in place; its
    # result then replaces that icon.
    update_src_dict['icon'] = find_icon(update_src_dict)
    return json_entry, text, media_lst, update_src_dict
def text(self) -> str:
    """Return the text of the stored HTML input with non-content tags removed.

    A fresh parse tree is built from ``self._html_input`` on every call, the
    tags listed below are stripped from it, and the remaining text is joined
    with single spaces and trimmed at both ends.
    """
    non_content_tags = [
        "head",
        "script",
        "noscript",
        "style",
        "iframe",
        "noembed",
        "noframes",
    ]
    scratch_tree = HTMLParser(self._html_input)
    scratch_tree.strip_tags(non_content_tags)
    extracted = scratch_tree.text(separator=" ")
    return extracted.strip()