Exemplos de Cleaner.xpath em Python, exemplos de lxml.html.clean.Cleaner.xpath em Python

Exemplo n.º 1

0

Exibir arquivo

def get_movies(site_content):
    """Collect the movies found in a page, grouped by section title.

    The page is parsed and passed through ``Cleaner(style=True)``, then every
    element matched by ``MOVIE_TICKETS_BLOCK`` is visited in document order.
    An element whose tag equals ``DIV_TAG`` switches the current section to
    the text found under ``MOVIES_TITLE_BLOCK``; any other element is handed
    to ``fetch_movies`` and its result is appended to the current section.

    Args:
        site_content: Markup of the page, as accepted by ``html.fromstring``.

    Returns:
        A dict mapping each section title to the list of movies gathered
        for it. Movies seen before the first section element are stored
        under ``MOVIES_POSTER``.

    Raises:
        IndexError: If a section element yields no text for
            ``MOVIES_TITLE_BLOCK``.
    """
    logger().info("Get movies")

    # Section used until the first section-title element is encountered.
    movies_section: str = MOVIES_POSTER
    movies: dict[str, list] = {}

    html_site_elements_content = Cleaner(style=True).clean_html(
        html.fromstring(site_content))

    for movie in html_site_elements_content.xpath(
            MOVIE_TICKETS_BLOCK):  # type: HtmlElement
        if movie.tag == DIV_TAG:
            # This element only names the section; it carries no movies.
            movies_section = movie.xpath(
                XPATH_GET_TEXT.format(xpath=MOVIES_TITLE_BLOCK))[0].strip()
            continue
        movies.setdefault(movies_section, []).extend(fetch_movies(movie))

    return movies

Exemplo n.º 2

0

Exibir arquivo

def parse_part(env, part, msg_id, inner=False):
    """Parse a MIME part into html, plain text and file entries.

    Multipart parts are walked recursively with ``inner=True``. Only the
    outermost call (``inner=False``) stores files through ``env.files`` and
    rewrites ``cid:`` image sources in the html.

    Args:
        env: Object providing ``env.files.to_db``, ``env.files.write`` and
            ``env.files.url``.
        part: Message part offering the accessors used below
            (``get_content_type``, ``is_multipart``, ``get_payload``,
            ``get_filename``, ...).
        msg_id: Message identifier; used to build storage paths and passed
            to the decoding helpers and log messages.
        inner: True for recursive calls, which return right after the html
            has been cleaned and leave ``files`` unprocessed.

    Returns:
        An ``OrderedDict`` with the keys ``files``, ``attachments``,
        ``embedded`` and ``html``; the outermost call always adds ``text``.
    """
    content = OrderedDict([('files', []), ('attachments', []),
                           ('embedded', {}), ('html', '')])

    ctype = part.get_content_type()
    mtype = part.get_content_maintype()
    stype = part.get_content_subtype()
    if part.is_multipart():
        for m in part.get_payload():
            child = parse_part(env, m, msg_id, True)
            child_html = child.pop('html', '')
            child_text = child.pop('text', '')
            content.setdefault('html', '')
            content.setdefault('text', '')
            if stype != 'alternative':
                content['html'] += child_html
                content['text'] += child_text
            else:
                # multipart/alternative: a later non-empty child replaces
                # what earlier children contributed instead of adding to it.
                content['html'] = child_html or content['html']
                content['text'] = child_text or content['text']
            content['files'] += child.pop('files')
            content.update(child)
    elif mtype == 'multipart':
        # Declared multipart, but the payload is not a list of sub-parts:
        # use the decoded payload as the html body.
        text = part.get_payload(decode=True)
        text = decode_str(text, part.get_content_charset(), msg_id=msg_id)
        content['html'] = text
    elif ctype in ['text/html', 'text/plain']:
        text = part.get_payload(decode=True)
        text = decode_str(text, part.get_content_charset(), msg_id=msg_id)
        if ctype == 'text/html':
            content['html'] = text
        elif ctype == 'text/plain' and not content['html']:
            text = html.unescape(text)
            text = re.sub(r'<[^>]+>', '', text)
            content['text'] = text
            text = text2html(text)
            content['html'] = text
    else:
        payload = part.get_payload(decode=True)
        filename = part.get_filename()
        # Parts without a filename are labelled with their content type.
        filename = decode_header(filename, msg_id) if filename else ctype
        attachment = {
            'mimetype': ctype,
            'id': part.get('Content-ID'),
            'filename': filename,
            'payload': payload,
            'size': len(payload) if payload else None
        }
        content['files'] += [attachment]

    content['html'] = clean_html(content['html'])
    if inner:
        return content

    # Values merged in from children are discarded; both collections are
    # rebuilt from ``files`` below.
    content.update(attachments=[], embedded={})
    for index, item in enumerate(content['files']):
        if item['payload']:
            name = slugify(item['filename'] or item['id'])
            path = '/'.join([slugify(msg_id), str(index), name])
            asset = env.files.to_db(path, item['mimetype'], item['filename'])
            if item['id']:
                content['embedded'][item['id']] = asset
            elif item['filename']:
                content['attachments'].append(asset)
            else:
                # ``warn`` is a deprecated alias that Python 3.13 removed
                # from the stdlib logger.
                # NOTE(review): assumes ``log`` is a stdlib logger - confirm.
                log.warning('UnknownAttachment(%s)', msg_id)
                continue

            env.files.write(path, item['payload'])

    if content['html']:
        htm = lh.fromstring(content['html'])

        # Fix img[@src]
        embedded = content['embedded']
        # Ids referenced by at least one <img>. Entries are looked up rather
        # than popped, so a second <img> pointing at the same cid is
        # rewritten too instead of losing its src.
        used = set()
        for img in htm.xpath('//img[@src]'):
            src = img.attrib.get('src')

            cid = re.match(r'^cid:(.*)', src)
            # ``embedded`` is keyed by the Content-ID header value, which is
            # looked up here in its angle-bracketed form.
            key = '<%s>' % cid.group(1) if cid else None
            obj = embedded.get(key) if cid else None
            if obj:
                used.add(key)
                img.attrib['src'] = env.files.url(obj['path'])
            elif not re.match(r'^(https?://|/|data:image/).*', src):
                # Only http(s), root-relative and data:image sources stay.
                del img.attrib['src']
        # Embedded files no <img> referenced are listed as attachments.
        content['attachments'] += [
            asset for key, asset in embedded.items() if key not in used
        ]

        content['html'] = lh.tostring(htm, encoding='utf-8').decode()
        if 'text' not in content or not content['text']:
            # No plain-text body was found: derive one from the html.
            htm = Cleaner(links=False, style=True).clean_html(htm)
            text = '\n'.join(htm.xpath('//text()'))
            content['text'] = text.strip()

    content['text'] = content.get('text') or ''
    return content

Exemplo n.º 3

0

Exibir arquivo

Arquivo: parser.py Projeto: TimofonicJunkRoom/mailur

def parse_part(env, part, msg_id, inner=False):
    """Turn a MIME part into a mapping of html, text and file entries.

    A multipart part is descended into recursively (``inner=True``). Storing
    files via ``env.files`` and rewriting ``cid:`` image sources happens
    only in the outermost call (``inner=False``).

    Args:
        env: Object providing ``env.files.to_db``, ``env.files.write`` and
            ``env.files.url``.
        part: Message part offering the accessors used below
            (``get_content_type``, ``is_multipart``, ``get_payload``,
            ``get_filename``, ...).
        msg_id: Message identifier; used to build storage paths and passed
            to the decoding helpers and log messages.
        inner: True for recursive calls, which return as soon as the html
            has been cleaned and leave ``files`` unprocessed.

    Returns:
        An ``OrderedDict`` holding ``files``, ``attachments``, ``embedded``
        and ``html``; the outermost call always sets ``text`` as well.
    """
    content = OrderedDict([
        ('files', []),
        ('attachments', []),
        ('embedded', {}),
        ('html', '')
    ])

    ctype = part.get_content_type()
    mtype = part.get_content_maintype()
    stype = part.get_content_subtype()
    if part.is_multipart():
        for m in part.get_payload():
            child = parse_part(env, m, msg_id, True)
            child_html = child.pop('html', '')
            child_text = child.pop('text', '')
            content.setdefault('html', '')
            content.setdefault('text', '')
            if stype != 'alternative':
                content['html'] += child_html
                content['text'] += child_text
            else:
                # For multipart/alternative a later non-empty child replaces
                # the earlier contribution rather than being appended.
                content['html'] = child_html or content['html']
                content['text'] = child_text or content['text']
            content['files'] += child.pop('files')
            content.update(child)
    elif mtype == 'multipart':
        # Main type says multipart, yet the payload is not a list of
        # sub-parts: the decoded payload becomes the html body.
        text = part.get_payload(decode=True)
        text = decode_str(text, part.get_content_charset(), msg_id=msg_id)
        content['html'] = text
    elif ctype in ['text/html', 'text/plain']:
        text = part.get_payload(decode=True)
        text = decode_str(text, part.get_content_charset(), msg_id=msg_id)
        if ctype == 'text/html':
            content['html'] = text
        elif ctype == 'text/plain' and not content['html']:
            text = html.unescape(text)
            text = re.sub(r'<[^>]+>', '', text)
            content['text'] = text
            text = text2html(text)
            content['html'] = text
    else:
        payload = part.get_payload(decode=True)
        filename = part.get_filename()
        # A part with no filename is labelled with its content type.
        filename = decode_header(filename, msg_id) if filename else ctype
        attachment = {
            'mimetype': ctype,
            'id': part.get('Content-ID'),
            'filename': filename,
            'payload': payload,
            'size': len(payload) if payload else None
        }
        content['files'] += [attachment]

    content['html'] = clean_html(content['html'])
    if inner:
        return content

    # Drop whatever children merged in; both collections are rebuilt from
    # ``files`` in the loop below.
    content.update(attachments=[], embedded={})
    for index, item in enumerate(content['files']):
        if item['payload']:
            name = slugify(item['filename'] or item['id'])
            path = '/'.join([slugify(msg_id), str(index), name])
            asset = env.files.to_db(path, item['mimetype'], item['filename'])
            if item['id']:
                content['embedded'][item['id']] = asset
            elif item['filename']:
                content['attachments'].append(asset)
            else:
                # ``warn`` is a deprecated alias that Python 3.13 removed
                # from the stdlib logger.
                # NOTE(review): assumes ``log`` is a stdlib logger - confirm.
                log.warning('UnknownAttachment(%s)', msg_id)
                continue

            env.files.write(path, item['payload'])

    if content['html']:
        htm = lh.fromstring(content['html'])

        # Fix img[@src]
        embedded = content['embedded']
        # Content ids that some <img> pointed at. Looking entries up without
        # popping them lets several <img> tags share one cid; popping made
        # every repeat lose its src.
        used = set()
        for img in htm.xpath('//img[@src]'):
            src = img.attrib.get('src')

            cid = re.match(r'^cid:(.*)', src)
            # ``embedded`` is keyed by the Content-ID header value, which is
            # looked up here in its angle-bracketed form.
            key = '<%s>' % cid.group(1) if cid else None
            obj = embedded.get(key) if cid else None
            if obj:
                used.add(key)
                img.attrib['src'] = env.files.url(obj['path'])
            elif not re.match(r'^(https?://|/|data:image/).*', src):
                # Only http(s), root-relative and data:image sources stay.
                del img.attrib['src']
        # Embedded files that no <img> used are exposed as attachments.
        content['attachments'] += [
            asset for key, asset in embedded.items() if key not in used
        ]

        content['html'] = lh.tostring(htm, encoding='utf-8').decode()
        if 'text' not in content or not content['text']:
            # No plain-text body was collected: derive one from the html.
            htm = Cleaner(links=False, style=True).clean_html(htm)
            text = '\n'.join(htm.xpath('//text()'))
            content['text'] = text.strip()

    content['text'] = content.get('text') or ''
    return content