예제 #1
0
def process(doc, params):
    """Extract readable text and the referenced images from an HTML document.

    Args:
        doc: HTML source handed to ``Document``.
        params: Mapping that must contain ``'url'``; image sources are
            resolved against this address with ``urljoin``.

    Returns:
        A 4-tuple ``(text, images, 1, None)``. ``text`` is the ``html2text``
        rendering of the title and summary; ``images`` is a list of
        ``(name, temp_file)`` pairs, one per image that was collected. The
        last two elements are the literal constants ``1`` and ``None``.

    Raises:
        KeyError: If ``params`` has no ``'url'`` entry.
    """
    url = params['url']
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []

    for img in html_body.reverse_tags(html_body.html, 'img'):
        src = img.get('src')
        if not src:
            # urljoin() returns the base URL unchanged for an empty or
            # missing reference, which would download the page itself and
            # store it as an "image".
            continue

        # Reset per iteration so the cleanup below never touches a temp file
        # that a previous iteration already handed over to ``images``.
        fp = None
        try:
            fp = tempfile.NamedTemporaryFile(dir='/tmp/')
            img_src = urljoin(url, src)
            if re.search(r'http[s]?://', img_src):
                r = requests.get(img_src, stream=True)
                img_name = get_filename_from_url(img_src)
                write_file(r, fp)
            else:
                # Anything else is treated as an inline "<meta>,<base64>"
                # source. Split once so a comma inside the payload cannot
                # break the unpacking.
                img_meta, content = img_src.split(',', 1)
                image = base64.b64decode(content)
                img_name = get_filename_from_base64(img_meta)
                fp.write(image)
        except Exception:
            # Best effort: one broken image must not abort the extraction,
            # but its temp file is released right away instead of waiting
            # for garbage collection.
            if fp is not None:
                fp.close()
            logger.error(
                'extractor.formats.html Image Collector Error!!',
                exc_info=True,
                extra={'data': {'url': url}},
            )
        else:
            images.append((img_name, fp))

    html = '<h1>' + title + '</h1>' + summary
    html = '<p>{}</p>'.format(html)

    text = html2text.html2text(html)
    return text, images, 1, None
예제 #2
0
파일: html.py 프로젝트: the-deep/server
def process(doc, url):
    """Extract plain text and the referenced images from an HTML document.

    Args:
        doc: HTML source handed to ``Document``.
        url: Address used as the base when resolving image sources with
            ``urljoin``.

    Returns:
        A 3-tuple ``(text, images, 1)``. ``text`` is what ``_get_plain_text``
        produces from the parsed title and summary markup; ``images`` is a
        list of open temporary files, one per image that was collected. The
        last element is the literal constant ``1``.
    """
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []

    for img in html_body.reverse_tags(html_body.html, 'img'):
        src = img.get('src')
        if not src:
            # urljoin() returns the base URL unchanged for an empty or
            # missing reference, which would download the page itself and
            # store it as an "image".
            continue

        # Reset per iteration so the cleanup below never touches a temp file
        # that a previous iteration already handed over to ``images``.
        fp = None
        try:
            fp = tempfile.NamedTemporaryFile(dir=settings.TEMP_DIR)
            img_src = urljoin(url, src)
            if re.search(r'http[s]?://', img_src):
                r = requests.get(img_src, stream=True)
                write_file(r, fp)
            else:
                # Anything else is treated as an inline "<meta>,<base64>"
                # source; the payload is the part after the first comma.
                image = base64.b64decode(img_src.split(',')[1])
                fp.write(image)
        except Exception:
            # Best effort: one broken image must not abort the extraction,
            # but its temp file is released right away instead of waiting
            # for garbage collection.
            if fp is not None:
                fp.close()
            logger.error(
                'extractor.formats.html Image Collector Error!!',
                exc_info=True,
                extra={'data': {
                    'url': url
                }},
            )
        else:
            images.append(fp)

    html = '<h1>' + title + '</h1>' + summary

    # Drop every newline before parsing (same result as the former
    # ``re.sub('\n*', '', html)``, without the regex).
    html = '<p>{}</p>'.format(html.replace('\n', ''))

    soup = BeautifulSoup(html, 'lxml')
    text = _get_plain_text(soup)
    return text, images, 1
예제 #3
0
파일: html.py 프로젝트: eoglethorpe/deeper
def process(doc):
    """Extract plain text and the referenced images from an HTML document.

    Args:
        doc: HTML source handed to ``Document``.

    Returns:
        A 2-tuple ``(text, images)``. ``text`` is what ``_get_plain_text``
        produces from the parsed title and summary markup; ``images`` is a
        list of open temporary files, one per image that was downloaded.
    """
    # Imported locally so the block does not depend on a module-level logger.
    import logging

    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []

    for img in html_body.reverse_tags(html_body.html, 'img'):
        src = img.get('src')
        if not src:
            # Nothing to download; avoid creating a temp file for it.
            continue

        # Reset per iteration so the cleanup below never touches a temp file
        # that a previous iteration already handed over to ``images``.
        fp = None
        try:
            fp = tempfile.NamedTemporaryFile(dir=settings.BASE_DIR)
            r = requests.get(src, stream=True)
            write_file(r, fp)
        except Exception:
            # Still best effort, but the failure is recorded instead of being
            # silently discarded, and the temp file is released right away.
            if fp is not None:
                fp.close()
            logging.getLogger(__name__).warning(
                'Failed to collect image %r', src, exc_info=True,
            )
        else:
            images.append(fp)

    html = '<h1>' + title + '</h1>' + summary

    # Drop every newline before parsing (same result as the former
    # ``re.sub('\n*', '', html)``, without the regex).
    html = '<p>{}</p>'.format(html.replace('\n', ''))

    soup = BeautifulSoup(html, 'lxml')
    text = _get_plain_text(soup)
    return text, images
예제 #4
0
    def simplify(self):
        """Reduce ``self.doc`` to its title and summary markup plus images.

        Returns:
            A 2-tuple ``(html, images)``. ``html`` is the title wrapped in
            ``<h1>`` followed by the summary, with all newlines removed;
            ``images`` is a list of open temporary files, one per image that
            was downloaded.

        Raises:
            StripError: If ``self.doc`` is empty or otherwise falsy.
        """
        # Imported locally so the method does not depend on a module-level
        # logger.
        import logging

        if not self.doc:
            raise StripError("Not a html document")

        html_body = Document(self.doc)
        summary = html_body.summary()
        title = html_body.short_title()
        images = []

        for img in html_body.reverse_tags(html_body.html, 'img'):
            src = img.get('src')
            if not src:
                # Nothing to download; avoid creating a temp file for it.
                continue

            # Reset per iteration so the cleanup below never touches a temp
            # file that a previous iteration already put into ``images``.
            fp = None
            try:
                fp = tempfile.NamedTemporaryFile(dir=settings.BASE_DIR)
                r = requests.get(src, stream=True)
                write_file(r, fp)
            except Exception:
                # Still best effort, but the failure is recorded instead of
                # being silently discarded, and the temp file is released
                # right away.
                if fp is not None:
                    fp.close()
                logging.getLogger(__name__).warning(
                    'Failed to collect image %r', src, exc_info=True,
                )
            else:
                images.append(fp)

        html = "<h1>" + title + "</h1>" + summary

        # Drop every newline (same result as the former
        # ``re.sub('\n*', '', html)``, without the regex).
        html = html.replace('\n', '')
        return html, images