Python HTMLParser.strip_tags 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: selectolax.parser

클래스/타입: HTMLParser

메소드/함수: strip_tags

hotexamples.com에서의 예제들: 5

Python HTMLParser.strip_tags - 5개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 selectolax.parser.HTMLParser.strip_tags의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

HTMLParser(30)

css(30)

css_first(30)

tags(16)

strip_tags(5)

text(4)

split(2)

decompose(1)

unwrap_tags(1)

예제 #1

파일 보기

파일: test_nodes.py 프로젝트: pappakrishnan/selectolax

def test_strip_tags():
    """strip_tags removes the listed tags and rejects a non-list argument."""
    parser = HTMLParser("<body><div></div><script></script></body>")
    expected_markup = '<html><head></head><body></body></html>'

    parser.strip_tags(['div', 'script'])
    assert parser.html == expected_markup

    # An integer instead of a list of tag names must raise TypeError.
    with pytest.raises(TypeError):
        parser.strip_tags(1)

예제 #2

파일 보기

파일: word.py 프로젝트: blindpandas/bookworm

 def make_proper_html(cls, html_string, docx_file_path):
     """Wrap converted body markup in a complete HTML document.

     The markup is parsed, the tags in ``TAGS_TO_UNWRAP`` are unwrapped
     inside the body and the tags in ``TAGS_TO_REMOVE`` are stripped.
     The ``<head>`` is filled from the DOCX core properties: the author
     (empty string when unset) and the title, with the file stem used
     instead when the title is blank or the placeholder "word document".

     Returns the document as a single string joined with ``NEWLINE``.
     """
     tree = HTMLParser(html_string)
     tree.body.unwrap_tags(TAGS_TO_UNWRAP)
     tree.strip_tags(TAGS_TO_REMOVE)
     body_markup = tree.body.html

     core_props = DocxDocumentReader(docx_file_path).core_properties
     title = core_props.title.strip()
     # Fall back to the file name for a blank or placeholder title.
     if title.lower() in ("", "word document"):
         title = docx_file_path.stem.strip()
     author = escape_html(core_props.author or "")

     document_lines = [
         "<!DOCTYPE html>",
         "<html>",
         "<head>",
         '<meta charset="utf-8"/>',
         f'<meta name="author" content="{author}"/>',
         f"<title>{escape_html(title)}</title>",
         "</head>",
         body_markup,
         "</html>",
     ]
     return NEWLINE.join(document_lines)

예제 #3

파일 보기

def extract_html_features(html):
    """Reduce an HTML document to a word-frequency feature dict.

    Steps:
    drop all script and style elements
    take the document text, lowercased
    apply TRANSLATOR (per the original notes, this removes punctuation)
    split into lines and trim each one
    split each line on double spaces and trim the pieces
    discard empty pieces
    count the occurrences of every whitespace-separated word

    Returns a dict mapping each word to its count, with keys in sorted
    order. Returns an empty dict when the document has no text or when
    parsing raises UnicodeDecodeError.
    """
    try:
        parser = HTMLParser(html)
        parser.strip_tags(['script', 'style'])
        raw_text = parser.root.text(separator=' ')
    except UnicodeDecodeError:
        return {}
    if not raw_text:
        return {}

    normalized = raw_text.lower().translate(TRANSLATOR)

    pieces = []
    for raw_line in normalized.splitlines():
        for fragment in raw_line.strip().split("  "):
            fragment = fragment.strip()
            if fragment:
                pieces.append(fragment)

    # Sorting first keeps the resulting dict's keys in sorted order.
    counts = {}
    for word in sorted('\n'.join(pieces).split()):
        counts[word] = counts.get(word, 0) + 1
    return counts

예제 #4

파일 보기

# Attributes that may hold a media path, per media tag, listed in the order
# in which they are collected for each node.
_MEDIA_URL_ATTRS = {
    'img': ('src',),
    'video': ('src', 'poster'),
    'audio': ('src',),
    'object': ('data',),
    'embed': ('src',),
    'source': ('src', 'srcset'),
}


def parse_html(json_entry):
    """Parse the HTML file named by an entry and extract text and media paths.

    Args:
        json_entry: Mapping with at least a 'path' key (file to read) and,
            optionally, a 'source' key. When 'source' is missing or empty,
            'id' and 'icon' keys are read as well.

    Returns:
        None when the parsed document has no body. Otherwise a 4-tuple
        ``(json_entry, text, media_lst, update_src_dict)`` where ``text`` is
        the body text after stripping non-text tags, ``media_lst`` is the
        list of non-empty media attribute values in document order, and
        ``update_src_dict`` is None unless the source URL had to be looked
        up from the document, in which case it is a dict with 'id',
        'source' and 'icon' keys.
    """
    path = json_entry['path']
    with open(path, 'r', encoding='utf8', errors='ignore') as f:
        html = f.read()

    tree = HTMLParser(html)

    if tree.body is None:
        return None

    source_url = json_entry.get('source', '')
    scraped_src = source_url == ''

    # Look for a canonical <link> tag and use its href as the source URL.
    if scraped_src:
        for node in tree.css('link'):
            attrs = node.attributes
            if attrs.get('rel') != 'canonical':
                continue
            # A canonical link without a usable href is skipped instead of
            # raising KeyError or storing an empty/None source URL.
            href = attrs.get('href')
            if href:
                source_url = href
                break

    # Find all multimedia file paths in every tag that could possibly
    # contain such paths, using a single CSS selector.
    media_lst = []
    for node in tree.css(','.join(_MEDIA_URL_ATTRS)):
        attrs = node.attributes
        for attr_name in _MEDIA_URL_ATTRS.get(node.tag, ()):
            value = attrs.get(attr_name)
            if value:
                media_lst.append(value)

    # These tags will never contain visible text.
    # NOTE(review): '[document]' is not an HTML tag name; it is kept only to
    # preserve existing behavior -- confirm how strip_tags interprets it.
    ignore_tags = [
        'img', 'video', 'audio', 'object', 'embed', 'source', 'script',
        'style', 'head', 'meta', '[document]'
    ]
    tree.strip_tags(ignore_tags)
    text = tree.body.text()

    if not scraped_src:
        return json_entry, text, media_lst, None

    # 'icon' is seeded from the entry before find_icon() receives the dict,
    # then replaced by whatever find_icon() returns.
    update_src_dict = {
        'id': json_entry['id'],
        'source': source_url,
        'icon': json_entry['icon'],
    }
    update_src_dict['icon'] = find_icon(update_src_dict)

    return json_entry, text, media_lst, update_src_dict

예제 #5

파일 보기

 def text(self) -> str:
     """Return the text of ``self._html_input`` without non-content tags.

     The input is parsed into a fresh tree each call, the listed tags are
     stripped from it, and the remaining text is joined with single
     spaces and trimmed of leading and trailing whitespace.
     """
     parser = HTMLParser(self._html_input)
     parser.strip_tags(
         ["head", "script", "noscript", "style", "iframe", "noembed", "noframes"]
     )
     extracted = parser.text(separator=" ")
     return extracted.strip()