def process(doc, params):
    """Extract readable text and referenced images from an HTML document.

    Parameters
    ----------
    doc : str
        Raw HTML of the page.
    params : dict
        Must contain ``'url'`` — the page URL, used to resolve relative
        image ``src`` attributes.

    Returns
    -------
    tuple
        ``(text, images, page_count, extra)`` where ``images`` is a list of
        ``(filename, tempfile)`` pairs, ``page_count`` is always 1 and
        ``extra`` is always None.
    """
    url = params['url']
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []
    for img in html_body.reverse_tags(html_body.html, 'img'):
        try:
            fp = tempfile.NamedTemporaryFile(dir='/tmp/')
            img_src = urljoin(url, img.get('src'))
            img_name = None
            # Anchor the scheme test at the start of the string: re.search
            # would also fire on a data URI whose payload happens to contain
            # "http://".
            if re.match(r'https?://', img_src):
                # timeout keeps one unresponsive host from hanging the whole
                # extraction run.
                r = requests.get(img_src, stream=True, timeout=30)
                img_name = get_filename_from_url(img_src)
                write_file(r, fp)
            else:
                # data URI: "data:image/...;base64,<payload>".  Split only on
                # the first comma — the payload itself may contain commas.
                img_meta, content = img_src.split(',', 1)
                image = base64.b64decode(content)
                img_name = get_filename_from_base64(img_meta)
                fp.write(image)
            images.append((img_name, fp))
        except Exception:
            # Best-effort image collection: log and continue with the rest.
            logger.error(
                'extractor.formats.html Image Collector Error!!',
                exc_info=True,
                extra={'data': {'url': url}},
            )
    html = '<h1>' + title + '</h1>' + summary
    html = '<p>{}</p>'.format(html)
    text = html2text.html2text(html)
    return text, images, 1, None
def process(doc, url):
    """Extract readable text and referenced images from an HTML document.

    Parameters
    ----------
    doc : str
        Raw HTML of the page.
    url : str
        Page URL, used to resolve relative image ``src`` attributes.

    Returns
    -------
    tuple
        ``(text, images, page_count)`` where ``images`` is a list of open
        temp files and ``page_count`` is always 1.
    """
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []
    for img in html_body.reverse_tags(html_body.html, 'img'):
        try:
            fp = tempfile.NamedTemporaryFile(dir=settings.TEMP_DIR)
            img_src = urljoin(url, img.get('src'))
            # Anchor the scheme test at the start of the string: re.search
            # would also fire on a data URI whose payload contains "http://".
            if re.match(r'https?://', img_src):
                # timeout keeps one unresponsive host from hanging the run.
                r = requests.get(img_src, stream=True, timeout=30)
                write_file(r, fp)
            else:
                # data URI payload; split once — the base64 payload itself
                # may contain commas.
                image = base64.b64decode(img_src.split(',', 1)[1])
                fp.write(image)
            images.append(fp)
        except Exception:
            # Best-effort image collection: log and continue with the rest.
            logger.error(
                'extractor.formats.html Image Collector Error!!',
                exc_info=True,
                extra={'data': {'url': url}},
            )
    html = '<h1>' + title + '</h1>' + summary
    # Drop literal newlines so they don't surface as spurious breaks in the
    # plain text.  (The old compiled pattern '\n*' with re.IGNORECASE matched
    # zero-width everywhere and IGNORECASE is meaningless for '\n'; a plain
    # str.replace is equivalent and clearer.)
    html = '<p>{}</p>'.format(html.replace('\n', ''))
    soup = BeautifulSoup(html, 'lxml')
    text = _get_plain_text(soup)
    return text, images, 1
def process(doc):
    """Extract readable text and referenced images from an HTML document.

    Parameters
    ----------
    doc : str
        Raw HTML of the page.  Image ``src`` values must be absolute URLs —
        no base URL is available here to resolve relative ones.

    Returns
    -------
    tuple
        ``(text, images)`` where ``images`` is a list of open temp files.
    """
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []
    for img in html_body.reverse_tags(html_body.html, 'img'):
        try:
            fp = tempfile.NamedTemporaryFile(dir=settings.BASE_DIR)
            # timeout keeps one unresponsive host from hanging the run.
            r = requests.get(img.get('src'), stream=True, timeout=30)
            write_file(r, fp)
            images.append(fp)
        except Exception:
            # Best-effort: a failed image download must not abort extraction.
            pass
    html = '<h1>' + title + '</h1>' + summary
    # Drop literal newlines so they don't surface as spurious breaks in the
    # plain text; str.replace is equivalent to the old zero-width '\n*' regex
    # and clearer.
    html = '<p>{}</p>'.format(html.replace('\n', ''))
    soup = BeautifulSoup(html, 'lxml')
    text = _get_plain_text(soup)
    return text, images
def simplify(self):
    """Reduce ``self.doc`` to its readable HTML core and collect its images.

    Returns
    -------
    tuple
        ``(html, images)`` — the simplified ``<h1>title</h1>`` + summary
        markup with newlines stripped, and a list of open temp files.

    Raises
    ------
    StripError
        If ``self.doc`` is empty or missing.
    """
    if not self.doc:
        raise StripError("Not a html document")
    html_body = Document(self.doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []
    for img in html_body.reverse_tags(html_body.html, 'img'):
        try:
            fp = tempfile.NamedTemporaryFile(dir=settings.BASE_DIR)
            # timeout keeps one unresponsive host from hanging the run.
            r = requests.get(img.get('src'), stream=True, timeout=30)
            write_file(r, fp)
            images.append(fp)
        except Exception:
            # Best-effort: a failed image download must not abort stripping.
            pass
    html = "<h1>" + title + "</h1>" + summary
    # Drop literal newlines; str.replace is equivalent to the old zero-width
    # '\n*' regex (IGNORECASE was meaningless for '\n') and clearer.
    html = html.replace('\n', '')
    return html, images