Example #1
import pytest
from selectolax.parser import HTMLParser


def test_strip_tags():
    html = "<body><div></div><script></script></body>"
    html_parser = HTMLParser(html)
    html_parser.strip_tags(['div', 'script'])
    assert html_parser.html == '<html><head></head><body></body></html>'

    with pytest.raises(TypeError):
        html_parser.strip_tags(1)
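
The parser API used throughout these examples (strip_tags, unwrap_tags, css, attributes) matches selectolax's HTMLParser; assuming that, here is a minimal sketch contrasting strip_tags, which drops matched nodes together with their contents, with unwrap_tags, which removes only the tag itself and keeps its children (used in Example #2 below).

from selectolax.parser import HTMLParser

stripped = HTMLParser("<div>keep <b>bold</b></div>")
stripped.strip_tags(["b"])      # the whole <b>bold</b> subtree is dropped
print(stripped.body.html)       # expected: <body><div>keep </div></body>

unwrapped = HTMLParser("<div>keep <b>bold</b></div>")
unwrapped.unwrap_tags(["b"])    # only the <b> wrapper is removed, its text stays
print(unwrapped.body.html)      # expected: <body><div>keep bold</div></body>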
Example #2
def make_proper_html(cls, html_string, docx_file_path):
    # Clean up the converted body: keep the contents of formatting-only tags,
    # drop unwanted tags together with their contents.
    parsed = HTMLParser(html_string)
    parsed.body.unwrap_tags(TAGS_TO_UNWRAP)
    parsed.strip_tags(TAGS_TO_REMOVE)
    html_string = parsed.body.html
    # Pull title and author from the .docx metadata, falling back to the
    # file name when the document has no meaningful title.
    docx = DocxDocumentReader(docx_file_path)
    props = docx.core_properties
    doc_title = props.title.strip()
    if not doc_title or doc_title.lower() == "word document":
        doc_title = docx_file_path.stem.strip()
    doc_author = escape_html(props.author or "")
    return NEWLINE.join([
        "<!DOCTYPE html>",
        "<html>",
        "<head>",
        '<meta charset="utf-8"/>',
        f'<meta name="author" content="{doc_author}"/>',
        f"<title>{escape_html(doc_title)}</title>",
        "</head>",
        html_string,
        "</html>",
    ])
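
The module-level names above (TAGS_TO_UNWRAP, TAGS_TO_REMOVE, NEWLINE, escape_html) and the DocxDocumentReader class are defined elsewhere in the original project and are not shown. A minimal set of stand-ins, purely as assumptions about their shape, could look like this; DocxDocumentReader is left out since it only needs to expose python-docx-style core_properties with title and author.

from html import escape as escape_html   # assumed: stdlib html.escape

NEWLINE = "\n"
TAGS_TO_UNWRAP = ["span", "font"]        # hypothetical: tags whose contents are kept
TAGS_TO_REMOVE = ["script", "style"]     # hypothetical: tags dropped with their contents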
Example #3
def extract_html_features(html):
    """Process an HTML document and extract its key features as text.

    Steps:
      - kill all script and style elements
      - get lowercase text
      - remove all punctuation
      - break into lines and remove leading and trailing space on each
      - break multi-headlines into a line each
      - drop blank lines
      - return a dict with features and their weights
    """
    try:
        tree = HTMLParser(html)
        tree.strip_tags(['script', 'style'])
        text = tree.root.text(separator=' ')
        if not text:
            return {}
    except UnicodeDecodeError:
        return {}
    # Lowercase, strip punctuation, then normalise whitespace line by line.
    text = text.lower().translate(TRANSLATOR)
    lines = (line.strip() for line in text.splitlines())
    # Headlines that share a line are separated by runs of double spaces.
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    # Count occurrences of each token (feature -> weight).
    return {k: sum(1 for _ in g) for k, g in groupby(sorted(text.split()))}
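
TRANSLATOR and the groupby import are not shown above. A minimal sketch of what the snippet assumes, given the docstring's "remove all punctuation" step (the exact table in the original project may differ):

import string
from itertools import groupby

from selectolax.parser import HTMLParser

# Assumed definition: a translation table that deletes ASCII punctuation.
TRANSLATOR = str.maketrans("", "", string.punctuation)

print(extract_html_features("<p>Breaking  news. Breaking!</p>"))
# expected (under these assumptions): {'breaking': 2, 'news': 1}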
Example #4
def parse_html(json_entry):
    path = json_entry['path']
    with open(path, 'r', encoding='utf8', errors='ignore') as f:
        html = f.read()

    tree = HTMLParser(html)

    if tree.body is None:
        return None

    scraped_src = False
    source_url = json_entry.get('source', '')

    # No source URL given: look for a canonical <link> tag and use its href.
    if source_url == '':
        scraped_src = True
        for node in tree.css('link'):
            if node.attributes.get('rel') == 'canonical':
                source_url = node.attributes['href']
                break

    # Find all multimedia file paths in tags that could possibly contain such
    # paths, using a CSS selector. Each tag is mapped to the attributes that
    # may hold a media URL.
    media_attrs = {
        'img': ('src',),
        'video': ('src', 'poster'),
        'audio': ('src',),
        'object': ('data',),
        'embed': ('src',),
        'source': ('src', 'srcset'),
    }
    media_lst = []
    for node in tree.css(','.join(media_attrs)):
        for attr in media_attrs[node.tag]:
            value = node.attributes.get(attr)
            if value:
                media_lst.append(value)

    # These tags will never contain visible text.
    ignore_tags = [
        'img', 'video', 'audio', 'object', 'embed', 'source', 'script',
        'style', 'head', 'meta', '[document]'
    ]
    tree.strip_tags(ignore_tags)
    text = tree.body.text()

    if scraped_src:
        update_src_dict = {}
        update_src_dict['id'] = json_entry['id']
        update_src_dict['source'] = source_url
        update_src_dict['icon'] = json_entry['icon']
        update_src_dict['icon'] = find_icon(update_src_dict)

        return json_entry, text, media_lst, update_src_dict

    else:
        return json_entry, text, media_lst, None
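
A minimal invocation sketch, assuming a json_entry dict shaped like the keys the function reads ('path', 'id', 'icon' and optionally 'source'); the path and values are hypothetical, and find_icon is only needed when 'source' is empty:

entry = {
    "id": 42,
    "path": "pages/article.html",             # hypothetical local HTML file
    "source": "https://example.com/article",  # non-empty, so find_icon is not called
    "icon": None,
}

result = parse_html(entry)
if result is not None:
    json_entry, text, media_lst, update_src_dict = result
    print(len(media_lst), "media paths found")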
Example #5
def text(self) -> str:
    new_tree = HTMLParser(self._html_input)
    tags = ["head", "script", "noscript", "style", "iframe", "noembed", "noframes"]
    new_tree.strip_tags(tags)
    return new_tree.text(separator=" ").strip()
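
This is a method on a class that is not shown. A minimal host class, assuming self._html_input simply stores the raw HTML passed to the constructor:

from selectolax.parser import HTMLParser


class PageContent:
    """Hypothetical host class; _html_input stores the raw HTML string."""

    def __init__(self, html_input: str) -> None:
        self._html_input = html_input

    # The method shown above, unchanged.
    def text(self) -> str:
        new_tree = HTMLParser(self._html_input)
        tags = ["head", "script", "noscript", "style", "iframe", "noembed", "noframes"]
        new_tree.strip_tags(tags)
        return new_tree.text(separator=" ").strip()


print(PageContent("<body><p>Hello</p><script>alert(1)</script></body>").text())
# expected: "Hello"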