Python Cleaner.xpath 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: lxml.html.clean

클래스/타입: Cleaner

메소드/함수: xpath

hotexamples.com에서의 예제들: 3

Python Cleaner.xpath - 3개의 예제를 찾았습니다. 오픈소스 프로젝트에서 추출한, Python lxml.html.clean.Cleaner.xpath의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

Cleaner(30)

clean_html(30)

style(30)

kill_tags(30)

javascript(30)

remove_tags(23)

scripts(21)

page_structure(19)

meta(19)

links(16)

remove_unknown_tags(15)

comments(14)

allow_tags(13)

safe_attrs_only(12)

embedded(11)

forms(11)

frames(9)

annoying_tags(8)

html(7)

processing_instructions(7)

inline_style(4)

safe_attrs(3)

xpath(2)

add_nofollow(2)

__call__(2)

allow_tag(1)

javasript(1)

remove_attributes(1)

host_whitelist(1)

replace(1)

frame(1)

embeded(1)

script(1)

allow_attributes(1)

startswith(1)

__init__(1)

whitelist_tags(1)

allow_embedded_url(1)

예제 #1

파일 보기

def get_movies(site_content):
    """Group the movies found in a page by the section they appear under.

    The page markup is parsed, cleaned with ``Cleaner(style=True)`` and
    walked via the ``MOVIE_TICKETS_BLOCK`` XPath.  Elements whose tag equals
    ``DIV_TAG`` are treated as section headers: their title text becomes the
    current section name.  Every other element is handed to
    ``fetch_movies`` and the result is appended to the current section.

    Args:
        site_content: Page markup accepted by ``html.fromstring``.

    Returns:
        A dict mapping section name to the list of movies collected for it.
        Movies seen before the first header land under ``MOVIES_POSTER``.

    Raises:
        IndexError: If a header element yields no title text.
    """
    logger().info("Get movies")

    # Section used until the first header element is encountered.
    movies_section: str = MOVIES_POSTER
    # Was ``dict[str:list]`` -- a slice, not a valid generic parameterisation.
    movies: dict[str, list] = {}

    html_site_elements_content = Cleaner(style=True).clean_html(
        html.fromstring(site_content))

    for movie in html_site_elements_content.xpath(
            MOVIE_TICKETS_BLOCK):  # type: HtmlElement
        if movie.tag == DIV_TAG:
            # Header element: switch the section, nothing to collect here.
            titles = movie.xpath(
                XPATH_GET_TEXT.format(xpath=MOVIES_TITLE_BLOCK))
            movies_section = titles[0].strip()
            continue
        # Build a new list each time (same as the original ``+``), so the
        # value returned by fetch_movies is never mutated.
        movies[movies_section] = (
            movies.get(movies_section, []) + fetch_movies(movie))

    return movies

예제 #2

파일 보기

def parse_part(env, part, msg_id, inner=False):
    """Parse one message part into a dict of html, text and files.

    Multipart parts are walked recursively.  For the outermost call
    (``inner=False``) the collected files that carry a payload are registered
    and written through ``env.files``, ``cid:`` image sources in the HTML are
    rewritten to stored-file URLs, and a plain-text version is derived from
    the HTML when no text was found.

    Args:
        env: Object exposing ``files.to_db``, ``files.write`` and
            ``files.url``.
        part: Message part providing ``get_content_type``,
            ``get_content_maintype``, ``get_content_subtype``,
            ``is_multipart``, ``get_payload``, ``get_content_charset``,
            ``get_filename`` and ``get`` (presumably an
            ``email.message.Message`` -- confirm against the caller).
        msg_id: Message identifier; passed to the decoding helpers, used in
            storage paths and in log messages.
        inner: True for recursive calls on child parts.  Such calls return
            right after the HTML has been cleaned, without storing files or
            fixing images.

    Returns:
        An ``OrderedDict`` with the keys ``files``, ``attachments``,
        ``embedded`` and ``html``.  The outermost call always sets ``text``
        as well; inner calls only set it for multipart and ``text/plain``
        parts.
    """
    content = OrderedDict([('files', []), ('attachments', []),
                           ('embedded', {}), ('html', '')])

    ctype = part.get_content_type()
    mtype = part.get_content_maintype()
    stype = part.get_content_subtype()
    if part.is_multipart():
        for m in part.get_payload():
            child = parse_part(env, m, msg_id, True)
            child_html = child.pop('html', '')
            child_text = child.pop('text', '')
            content.setdefault('html', '')
            content.setdefault('text', '')
            if stype != 'alternative':
                # Non-alternative children are concatenated in order.
                content['html'] += child_html
                content['text'] += child_text
            else:
                # multipart/alternative: the last non-empty variant wins.
                content['html'] = child_html or content['html']
                content['text'] = child_text or content['text']
            content['files'] += child.pop('files')
            content.update(child)
    elif mtype == 'multipart':
        # Declared multipart but is_multipart() is false: the payload is
        # used as HTML as-is.
        text = part.get_payload(decode=True)
        text = decode_str(text, part.get_content_charset(), msg_id=msg_id)
        content['html'] = text
    elif ctype in ['text/html', 'text/plain']:
        text = part.get_payload(decode=True)
        text = decode_str(text, part.get_content_charset(), msg_id=msg_id)
        if ctype == 'text/html':
            content['html'] = text
        elif ctype == 'text/plain' and not content['html']:
            # Unescape entities and strip anything tag-like, then build the
            # HTML version from the plain text.
            text = html.unescape(text)
            text = re.sub(r'<[^>]+>', '', text)
            content['text'] = text
            text = text2html(text)
            content['html'] = text
    else:
        # Any other content type is collected as a file.
        payload = part.get_payload(decode=True)
        filename = part.get_filename()
        # Without a filename the content type is used as the name.
        filename = decode_header(filename, msg_id) if filename else ctype
        attachment = {
            'mimetype': ctype,
            'id': part.get('Content-ID'),
            'filename': filename,
            'payload': payload,
            'size': len(payload) if payload else None
        }
        content['files'] += [attachment]

    content['html'] = clean_html(content['html'])
    if inner:
        return content

    # Outermost call: rebuild attachments/embedded from the collected files.
    content.update(attachments=[], embedded={})
    for index, item in enumerate(content['files']):
        if item['payload']:
            name = slugify(item['filename'] or item['id'])
            path = '/'.join([slugify(msg_id), str(index), name])
            asset = env.files.to_db(path, item['mimetype'], item['filename'])
            if item['id']:
                # Keyed by the Content-ID header value.
                content['embedded'][item['id']] = asset
            elif item['filename']:
                content['attachments'].append(asset)
            else:
                # NOTE(review): if `log` is a stdlib logging.Logger, `warn`
                # is a deprecated alias of `warning` -- confirm and switch.
                log.warn('UnknownAttachment(%s)', msg_id)
                continue

            env.files.write(path, item['payload'])

    if content['html']:
        htm = lh.fromstring(content['html'])

        # Fix img[@src]
        embedded = content['embedded']
        # Content-IDs referenced by at least one <img>.  Tracking them in a
        # set (instead of popping from a copy) lets several images reference
        # the same embedded file; previously the second reference found
        # nothing and lost its src attribute.
        referenced = set()
        for img in htm.xpath('//img[@src]'):
            src = img.attrib.get('src')

            cid = re.match('^cid:(.*)', src)
            obj = None
            if cid:
                # The lookup key is the cid wrapped in angle brackets.
                key = '<%s>' % cid.group(1)
                if key in embedded:
                    referenced.add(key)
                    obj = embedded[key]
            if obj:
                img.attrib['src'] = env.files.url(obj['path'])
            elif not re.match('^(https?://|/|data:image/).*', src):
                # Drop sources that are neither resolved cid references nor
                # http(s)/absolute-path/data:image URLs.
                del img.attrib['src']
        # Embedded files no image referenced are listed as attachments.
        content['attachments'] += [
            asset for key, asset in embedded.items()
            if key not in referenced
        ]

        content['html'] = lh.tostring(htm, encoding='utf-8').decode()
        if 'text' not in content or not content['text']:
            # No plain text available: derive it from the HTML text nodes.
            htm = Cleaner(links=False, style=True).clean_html(htm)
            text = '\n'.join(htm.xpath('//text()'))
            content['text'] = text.strip()

    content['text'] = content.get('text') or ''
    return content

예제 #3

파일 보기

파일: parser.py 프로젝트: TimofonicJunkRoom/mailur

def parse_part(env, part, msg_id, inner=False):
    """Turn a message part into a dict holding its html, text and files.

    Child parts of a multipart message are parsed recursively.  Only the
    outermost call (``inner=False``) stores payload-bearing files through
    ``env.files``, rewrites ``cid:`` image sources in the HTML to stored-file
    URLs and, if no text was collected, derives the text from the HTML.

    Args:
        env: Object exposing ``files.to_db``, ``files.write`` and
            ``files.url``.
        part: Message part; the code calls ``get_content_type``,
            ``get_content_maintype``, ``get_content_subtype``,
            ``is_multipart``, ``get_payload``, ``get_content_charset``,
            ``get_filename`` and ``get`` on it (presumably an
            ``email.message.Message`` -- confirm against the caller).
        msg_id: Message identifier, forwarded to the decoding helpers and
            used for storage paths and log messages.
        inner: Set to True by the recursive calls; the function then returns
            as soon as the HTML has been cleaned.

    Returns:
        An ``OrderedDict`` containing ``files``, ``attachments``,
        ``embedded`` and ``html``.  ``text`` is always present for the
        outermost call; for inner calls it is only present for multipart and
        ``text/plain`` parts.
    """
    content = OrderedDict([
        ('files', []),
        ('attachments', []),
        ('embedded', {}),
        ('html', '')
    ])

    ctype = part.get_content_type()
    mtype = part.get_content_maintype()
    stype = part.get_content_subtype()
    if part.is_multipart():
        for m in part.get_payload():
            child = parse_part(env, m, msg_id, True)
            child_html = child.pop('html', '')
            child_text = child.pop('text', '')
            content.setdefault('html', '')
            content.setdefault('text', '')
            if stype != 'alternative':
                # Children of non-alternative containers are concatenated.
                content['html'] += child_html
                content['text'] += child_text
            else:
                # For multipart/alternative the last non-empty variant wins.
                content['html'] = child_html or content['html']
                content['text'] = child_text or content['text']
            content['files'] += child.pop('files')
            content.update(child)
    elif mtype == 'multipart':
        # Main type is multipart although is_multipart() is false: the
        # decoded payload is taken as HTML unchanged.
        text = part.get_payload(decode=True)
        text = decode_str(text, part.get_content_charset(), msg_id=msg_id)
        content['html'] = text
    elif ctype in ['text/html', 'text/plain']:
        text = part.get_payload(decode=True)
        text = decode_str(text, part.get_content_charset(), msg_id=msg_id)
        if ctype == 'text/html':
            content['html'] = text
        elif ctype == 'text/plain' and not content['html']:
            # Unescape entities, remove tag-like fragments and generate the
            # HTML version from the remaining plain text.
            text = html.unescape(text)
            text = re.sub(r'<[^>]+>', '', text)
            content['text'] = text
            text = text2html(text)
            content['html'] = text
    else:
        # Everything else is recorded as a file.
        payload = part.get_payload(decode=True)
        filename = part.get_filename()
        # The content type stands in for a missing filename.
        filename = decode_header(filename, msg_id) if filename else ctype
        attachment = {
            'mimetype': ctype,
            'id': part.get('Content-ID'),
            'filename': filename,
            'payload': payload,
            'size': len(payload) if payload else None
        }
        content['files'] += [attachment]

    content['html'] = clean_html(content['html'])
    if inner:
        return content

    # From here on only the outermost call runs: attachments/embedded are
    # rebuilt from the files gathered across all parts.
    content.update(attachments=[], embedded={})
    for index, item in enumerate(content['files']):
        if item['payload']:
            name = slugify(item['filename'] or item['id'])
            path = '/'.join([slugify(msg_id), str(index), name])
            asset = env.files.to_db(path, item['mimetype'], item['filename'])
            if item['id']:
                # Stored under the Content-ID header value.
                content['embedded'][item['id']] = asset
            elif item['filename']:
                content['attachments'].append(asset)
            else:
                # NOTE(review): if `log` is a stdlib logging.Logger, `warn`
                # is a deprecated alias of `warning` -- confirm and switch.
                log.warn('UnknownAttachment(%s)', msg_id)
                continue

            env.files.write(path, item['payload'])

    if content['html']:
        htm = lh.fromstring(content['html'])

        # Fix img[@src]
        embedded = content['embedded']
        # Remember which Content-IDs were used by an <img> rather than
        # popping them: the same embedded file may be referenced by several
        # images, and popping made every reference after the first lose its
        # src attribute.
        referenced = set()
        for img in htm.xpath('//img[@src]'):
            src = img.attrib.get('src')

            cid = re.match('^cid:(.*)', src)
            obj = None
            if cid:
                # Keys are looked up as the cid wrapped in angle brackets.
                key = '<%s>' % cid.group(1)
                if key in embedded:
                    referenced.add(key)
                    obj = embedded[key]
            if obj:
                img.attrib['src'] = env.files.url(obj['path'])
            elif not re.match('^(https?://|/|data:image/).*', src):
                # Neither a resolved cid reference nor an
                # http(s)/absolute-path/data:image URL: remove the source.
                del img.attrib['src']
        # Embedded files that no image referenced become attachments.
        content['attachments'] += [
            asset for key, asset in embedded.items()
            if key not in referenced
        ]

        content['html'] = lh.tostring(htm, encoding='utf-8').decode()
        if 'text' not in content or not content['text']:
            # No text collected: join the text nodes of the cleaned HTML.
            htm = Cleaner(links=False, style=True).clean_html(htm)
            text = '\n'.join(htm.xpath('//text()'))
            content['text'] = text.strip()

    content['text'] = content.get('text') or ''
    return content