def process(doc, params):
    """Extract readable text and referenced images from an HTML document.

    Parameters
    ----------
    doc : str
        Raw HTML of the page.
    params : dict
        Must contain ``'url'`` — the page URL, used to resolve relative
        image ``src`` attributes.

    Returns
    -------
    tuple
        ``(text, images, page_count, extra)`` where ``images`` is a list of
        ``(filename, tempfile)`` pairs, ``page_count`` is always 1 and
        ``extra`` is always None.
    """
    url = params['url']
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []
    for img in html_body.reverse_tags(html_body.html, 'img'):
        try:
            fp = tempfile.NamedTemporaryFile(dir='/tmp/')
            img_src = urljoin(url, img.get('src'))
            img_name = None
            # Anchor the scheme test at the start of the string: re.search
            # would also fire on a data URI whose payload happens to contain
            # "http://".
            if re.match(r'https?://', img_src):
                # timeout keeps one unresponsive host from hanging the whole
                # extraction run.
                r = requests.get(img_src, stream=True, timeout=30)
                img_name = get_filename_from_url(img_src)
                write_file(r, fp)
            else:
                # data URI: "data:image/...;base64,<payload>".  Split only on
                # the first comma — the payload itself may contain commas.
                img_meta, content = img_src.split(',', 1)
                image = base64.b64decode(content)
                img_name = get_filename_from_base64(img_meta)
                fp.write(image)
            images.append((img_name, fp))
        except Exception:
            # Best-effort image collection: log and continue with the rest.
            logger.error(
                'extractor.formats.html Image Collector Error!!',
                exc_info=True,
                extra={'data': {'url': url}},
            )
    html = '<h1>' + title + '</h1>' + summary
    html = '<p>{}</p>'.format(html)
    text = html2text.html2text(html)
    return text, images, 1, None
def process(doc, url):
    """Extract readable text and referenced images from an HTML document.

    Parameters
    ----------
    doc : str
        Raw HTML of the page.
    url : str
        Page URL, used to resolve relative image ``src`` attributes.

    Returns
    -------
    tuple
        ``(text, images, page_count)`` where ``images`` is a list of open
        temp files and ``page_count`` is always 1.
    """
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []
    for img in html_body.reverse_tags(html_body.html, 'img'):
        try:
            fp = tempfile.NamedTemporaryFile(dir=settings.TEMP_DIR)
            img_src = urljoin(url, img.get('src'))
            # Anchor the scheme test at the start of the string: re.search
            # would also fire on a data URI whose payload contains "http://".
            if re.match(r'https?://', img_src):
                # timeout keeps one unresponsive host from hanging the run.
                r = requests.get(img_src, stream=True, timeout=30)
                write_file(r, fp)
            else:
                # data URI payload; split once — the base64 payload itself
                # may contain commas.
                image = base64.b64decode(img_src.split(',', 1)[1])
                fp.write(image)
            images.append(fp)
        except Exception:
            # Best-effort image collection: log and continue with the rest.
            logger.error(
                'extractor.formats.html Image Collector Error!!',
                exc_info=True,
                extra={'data': {'url': url}},
            )
    html = '<h1>' + title + '</h1>' + summary
    # Drop literal newlines so they don't surface as spurious breaks in the
    # plain text.  (The old compiled pattern '\n*' with re.IGNORECASE matched
    # zero-width everywhere and IGNORECASE is meaningless for '\n'; a plain
    # str.replace is equivalent and clearer.)
    html = '<p>{}</p>'.format(html.replace('\n', ''))
    soup = BeautifulSoup(html, 'lxml')
    text = _get_plain_text(soup)
    return text, images, 1
def process(doc):
    """Extract readable text and referenced images from an HTML document.

    Parameters
    ----------
    doc : str
        Raw HTML of the page.  Image ``src`` values must be absolute URLs —
        no base URL is available here to resolve relative ones.

    Returns
    -------
    tuple
        ``(text, images)`` where ``images`` is a list of open temp files.
    """
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []
    for img in html_body.reverse_tags(html_body.html, 'img'):
        try:
            fp = tempfile.NamedTemporaryFile(dir=settings.BASE_DIR)
            # timeout keeps one unresponsive host from hanging the run.
            r = requests.get(img.get('src'), stream=True, timeout=30)
            write_file(r, fp)
            images.append(fp)
        except Exception:
            # Best-effort: a failed image download must not abort extraction.
            pass
    html = '<h1>' + title + '</h1>' + summary
    # Drop literal newlines so they don't surface as spurious breaks in the
    # plain text; str.replace is equivalent to the old zero-width '\n*' regex
    # and clearer.
    html = '<p>{}</p>'.format(html.replace('\n', ''))
    soup = BeautifulSoup(html, 'lxml')
    text = _get_plain_text(soup)
    return text, images
def simplify(self):
    """Reduce ``self.doc`` to its readable HTML core and collect its images.

    Returns
    -------
    tuple
        ``(html, images)`` — the simplified ``<h1>title</h1>`` + summary
        markup with newlines stripped, and a list of open temp files.

    Raises
    ------
    StripError
        If ``self.doc`` is empty or missing.
    """
    if not self.doc:
        raise StripError("Not a html document")
    html_body = Document(self.doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []
    for img in html_body.reverse_tags(html_body.html, 'img'):
        try:
            fp = tempfile.NamedTemporaryFile(dir=settings.BASE_DIR)
            # timeout keeps one unresponsive host from hanging the run.
            r = requests.get(img.get('src'), stream=True, timeout=30)
            write_file(r, fp)
            images.append(fp)
        except Exception:
            # Best-effort: a failed image download must not abort stripping.
            pass
    html = "<h1>" + title + "</h1>" + summary
    # Drop literal newlines; str.replace is equivalent to the old zero-width
    # '\n*' regex (IGNORECASE was meaningless for '\n') and clearer.
    html = html.replace('\n', '')
    return html, images