# Module-level imports assumed by the snippets below; write_file and
# _get_plain_text are helpers defined in the surrounding extractor module.
import base64
import logging
import re
import tempfile
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from django.conf import settings
from readability import Document

logger = logging.getLogger(__name__)


def process(doc, url):
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()

    images = []
    for img in html_body.reverse_tags(html_body.html, 'img'):
        try:
            fp = tempfile.NamedTemporaryFile(dir=settings.TEMP_DIR)
            img_src = urljoin(url, img.get('src'))
            if re.match(r'https?://', img_src):
                # Remote image: stream it into the temp file.
                r = requests.get(img_src, stream=True)
                write_file(r, fp)
            else:
                # Inline data URI: decode the base64 payload after the comma.
                image = base64.b64decode(img_src.split(',')[1])
                fp.write(image)
            images.append(fp)
        except Exception:
            logger.error(
                'extractor.formats.html Image Collector Error!!',
                exc_info=True,
                extra={'data': {'url': url}},
            )

    html = '<h1>' + title + '</h1>' + summary
    # Collapse newlines so the extracted body becomes a single block.
    regex = re.compile(r'\n+')
    html = '<p>{}</p>'.format(regex.sub('', html))

    soup = BeautifulSoup(html, 'lxml')
    text = _get_plain_text(soup)
    return text, images, 1
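A minimal usage sketch for process(), assuming it is importable from an extractor.formats.html module as the log message suggests; the module path and sample URL are illustrative, not from the source:

import requests
from extractor.formats.html import process  # assumed module path

url = 'https://example.com/article'  # illustrative URL
response = requests.get(url)
text, images, page_count = process(response.content, url)
print(text[:200])    # extracted plain text
print(len(images))   # temp files holding any collected images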
def download(file_url, SUPPORTED_MIME_TYPES):
    """
    Download a file from the given URL (e.g. a Google Drive export or
    Dropbox link) and return the temp file plus its mime type.

    params:
        file_url: url of the file to download
    """
    # NOTE: SUPPORTED_MIME_TYPES is accepted but not checked in this
    # variant; the variant below enforces it.
    outfp = tempfile.NamedTemporaryFile()
    # TODO: verify url
    r = requests.get(file_url, stream=True, headers=DEFAULT_HEADERS)
    mime_type = r.headers["content-type"]
    write_file(r, outfp)
    return outfp, mime_type
def download(file_url, SUPPORTED_MIME_TYPES, exception=None):
    """
    Download a file from the given URL (e.g. a Google Drive export or
    Dropbox link), keeping it only if its mime type is supported.

    params:
        file_url: url of the file to download
        SUPPORTED_MIME_TYPES: mime types that may be written to disk
        exception: exception class to raise for unsupported mime types
    """
    outfp = tempfile.NamedTemporaryFile()
    # TODO: verify url
    r = requests.get(file_url, stream=True, headers=DEFAULT_HEADERS)
    mime_type = r.headers["content-type"]
    if mime_type in SUPPORTED_MIME_TYPES:
        write_file(r, outfp)
        return outfp, mime_type
    if exception:
        raise exception('Unsupported Mime Type: ' + mime_type)
    # Implicitly returns None when the type is unsupported and no
    # exception class was supplied.
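A hedged usage sketch for the mime-checking download() variant; the allow-list, URL and exception class are illustrative assumptions, not from the source:

class UnsupportedMimeTypeError(Exception):  # illustrative exception class
    pass

SUPPORTED_MIME_TYPES = ['application/pdf']  # illustrative allow-list

fp, mime_type = download(
    'https://example.com/report.pdf',  # illustrative URL
    SUPPORTED_MIME_TYPES,
    exception=UnsupportedMimeTypeError,
)
print(mime_type)  # e.g. 'application/pdf'; fp is the downloaded temp file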
# __init__ of a web-document wrapper; the HTML/PDF/DOCX/PPTX constants,
# DEFAULT_HEADERS, write_file and get_web_info_extractor come from the
# surrounding module.
def __init__(self, url):
    type = HTML
    doc = None
    params = {'url': url}

    try:
        r = requests.head(url, headers=DEFAULT_HEADERS, verify=False)
    except requests.exceptions.RequestException:
        # If we can't get the headers, assume html and try to continue.
        r = requests.get(url, headers=DEFAULT_HEADERS, verify=False)
        doc = r.content
        super().__init__(doc, type, params=params)
        return

    if not r.headers.get('content-type') or \
            any(x in r.headers["content-type"] for x in self.HTML_TYPES):
        doc = get_web_info_extractor(url).get_content()
    else:
        fp = tempfile.NamedTemporaryFile(dir=settings.TEMP_DIR, delete=False)
        r = requests.get(url, stream=True, headers=DEFAULT_HEADERS,
                         verify=False)
        write_file(r, fp)
        doc = fp

        # Pick a document type from the response's content-type header.
        if any(x in r.headers["content-type"] for x in self.PDF_TYPES):
            type = PDF
        elif any(x in r.headers["content-type"] for x in self.DOCX_TYPES):
            type = DOCX
        elif any(x in r.headers["content-type"] for x in self.PPTX_TYPES):
            type = PPTX

    super().__init__(doc, type, params=params)
# Earlier variant of process(): no base url for resolving relative image
# sources, temp files created under settings.BASE_DIR, and failed image
# downloads are silently skipped.
def process(doc):
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()

    images = []
    for img in html_body.reverse_tags(html_body.html, 'img'):
        try:
            fp = tempfile.NamedTemporaryFile(dir=settings.BASE_DIR)
            r = requests.get(img.get('src'), stream=True)
            write_file(r, fp)
            images.append(fp)
        except Exception:
            pass

    html = '<h1>' + title + '</h1>' + summary
    # Collapse newlines so the extracted body becomes a single block.
    regex = re.compile(r'\n+')
    html = '<p>{}</p>'.format(regex.sub('', html))

    soup = BeautifulSoup(html, 'lxml')
    text = _get_plain_text(soup)
    return text, images
# Earlier variant of the web-document __init__: no params dict, no
# TLS-verification override, and HTML content fetched directly rather
# than through get_web_info_extractor.
def __init__(self, url):
    type = HTML
    doc = None

    try:
        r = requests.head(url, headers=DEFAULT_HEADERS)
    except requests.exceptions.RequestException:
        # If we can't get the headers, assume html and try to continue.
        r = requests.get(url, headers=DEFAULT_HEADERS)
        doc = r.content
        super().__init__(doc, type)
        return

    if not r.headers.get('content-type') or \
            any(x in r.headers["content-type"] for x in self.HTML_TYPES):
        r = requests.get(url, headers=DEFAULT_HEADERS)
        doc = r.content
    else:
        fp = tempfile.NamedTemporaryFile(dir=settings.BASE_DIR)
        r = requests.get(url, stream=True, headers=DEFAULT_HEADERS)
        write_file(r, fp)
        doc = fp

        # Pick a document type from the response's content-type header.
        if any(x in r.headers["content-type"] for x in self.PDF_TYPES):
            type = PDF
        elif any(x in r.headers["content-type"] for x in self.DOCX_TYPES):
            type = DOCX
        elif any(x in r.headers["content-type"] for x in self.PPTX_TYPES):
            type = PPTX

    super().__init__(doc, type)
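The two __init__ variants above sit inside a class hierarchy that is not shown here. A minimal sketch of that context, where every name (class names, type constants, mime lists) is an assumption for illustration only:

HTML, PDF, DOCX, PPTX = 'html', 'pdf', 'docx', 'pptx'  # illustrative constants

class BaseDocument:  # illustrative base class
    def __init__(self, doc, type, params=None):
        self.doc = doc
        self.type = type
        self.params = params

class WebDocument(BaseDocument):  # illustrative subclass name
    HTML_TYPES = ['text/html', 'application/xhtml+xml']
    PDF_TYPES = ['application/pdf']
    DOCX_TYPES = ['application/vnd.openxmlformats-officedocument'
                  '.wordprocessingml.document']
    PPTX_TYPES = ['application/vnd.openxmlformats-officedocument'
                  '.presentationml.presentation']

    # The newer __init__ shown above would be defined here.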