def test_html_detect(self):
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO('<html><body>hi</body></html>'.encode('utf-16le'))
    ))
    self.assertFalse(HTMLReader.is_file(
        io.BytesIO('hello world!'.encode('utf-16le'))
    ))
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO(b'<title>hello</title>hi')
    ))
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO(b'<html><body>hello')
    ))
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO(
            b'The document has moved <a href="somewhere.html">here</a>'
        )
    ))
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.htm'))
    )
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.html'))
    )
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.dhtm'))
    )
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.xhtml'))
    )
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.xht'))
    )
    self.assertFalse(
        HTMLReader.is_url(URLInfo.parse('example.com/image.jpg'))
    )
    self.assertTrue(
        HTMLReader.is_request(Request('example.com/index.html'))
    )
    self.assertFalse(
        HTMLReader.is_request(Request('example.com/image.jpg'))
    )

    response = Response(200, 'OK')
    response.fields['Content-Type'] = 'text/html'
    self.assertTrue(HTMLReader.is_response(response))

    response = Response(200, 'OK')
    response.fields['Content-Type'] = 'image/png'
    self.assertFalse(HTMLReader.is_response(response))
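
# A minimal usage sketch of the detection helpers exercised by the test
# above, assuming only the signatures shown there (is_response takes a
# Response, is_file takes a binary file object). Combining the two checks
# this way is an illustration of caller-side sniffing, not wpull's own
# dispatch logic; looks_like_html is a hypothetical helper.
def looks_like_html(response, body_file):
    # Trust the declared Content-Type first; fall back to sniffing the body.
    return HTMLReader.is_response(response) or HTMLReader.is_file(body_file)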
def scrape_document(self, item_session):
    response = item_session.response
    url_info = item_session.request.url_info
    url = url_info.raw

    if response_body_size(response) < 30 * 1024 * 1024:
        dupes_db = self.dupes_db
        body = response.body.content()
        if HTMLReader.is_response(response):
            body = dupespotter.process_body(body, url)
        digest = hashlib.md5(body).digest()
        if dupes_db is not None:
            dupe_of = dupes_db.get_old_url(digest)
        else:
            dupe_of = None
        if dupe_of is not None:
            # Don't extract links from pages we've already seen
            # to avoid loops that descend a directory endlessly
            print("DUPE {}\n OF {}".format(url, dupe_of))
            return
        else:
            if dupes_db is not None:
                dupes_db.set_old_url(digest, url)

    super().scrape_document(item_session)
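
# A minimal sketch of the dupes-DB interface that scrape_document() relies
# on: only get_old_url(digest) and set_old_url(digest, url) are needed. The
# class name InMemoryDupesDb and the dict backing store are assumptions for
# illustration; the real store could be any persistent key-value database.
class InMemoryDupesDb:
    def __init__(self):
        self._digest_to_url = {}  # MD5 digest (bytes) -> first URL seen

    def get_old_url(self, digest):
        # Return the URL previously recorded for this body digest, or None.
        return self._digest_to_url.get(digest)

    def set_old_url(self, digest, url):
        # Record the first URL observed with this body digest.
        self._digest_to_url[digest] = url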
def _append_filename_extension(self, response):
    '''Append an HTML/CSS file suffix as needed.'''
    if not self._filename:
        return

    if response.request.url_info.scheme not in ('http', 'https'):
        return

    if not re.search(r'\.[hH][tT][mM][lL]?$', self._filename) and \
            HTMLReader.is_response(response):
        self._filename += '.html'
    elif not re.search(r'\.[cC][sS][sS]$', self._filename) and \
            CSSReader.is_response(response):
        self._filename += '.css'
def _append_filename_extension(self, response: BaseResponse):
    '''Append an HTML/CSS file suffix as needed.'''
    if not self._filename:
        return

    if response.request.url_info.scheme not in ('http', 'https'):
        return

    if not re.search(r'\.[hH][tT][mM][lL]?$', self._filename) and \
            HTMLReader.is_response(response):
        self._filename += '.html'
    elif not re.search(r'\.[cC][sS][sS]$', self._filename) and \
            CSSReader.is_response(response):
        self._filename += '.css'
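
# A standalone sketch of the suffix test used above, for checking the
# case-insensitive match in isolation; needs_html_suffix is a hypothetical
# helper, not part of wpull. The character-class pattern is equivalent to
# re.search(r'\.html?$', filename, re.IGNORECASE).
import re

def needs_html_suffix(filename):
    # True when the name does not already end in .htm/.html (any case).
    return not re.search(r'\.[hH][tT][mM][lL]?$', filename)

assert needs_html_suffix('page')            # would become 'page.html'
assert not needs_html_suffix('index.HTML')  # suffix already present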
def scrape_document(self, request, response, url_item):
    if response.body.size() < 30 * 1024 * 1024:
        dupes_db = self.dupes_db
        body = response.body.content()
        if HTMLReader.is_response(response):
            body = dupespotter.process_body(body, response.request.url)
        digest = hashlib.md5(body).digest()
        if dupes_db is not None:
            dupe_of = dupes_db.get_old_url(digest)
        else:
            dupe_of = None
        if dupe_of is not None:
            # Don't extract links from pages we've already seen
            # to avoid loops that descend a directory endlessly
            print(" DUPE {}\n OF {}".format(response.request.url, dupe_of))
            return
        else:
            if dupes_db is not None:
                dupes_db.set_old_url(digest, response.request.url)

    super().scrape_document(request, response, url_item)
def scrape_document(self, request, response, url_item):
    if response.body.size() < 30 * 1024 * 1024:
        dupes_db = self.dupes_db
        body = response.body.content()
        if HTMLReader.is_response(response):
            body = archivebot.dupespotter.dupespotter.process_body(
                body, response.request.url)
        digest = hashlib.md5(body).digest()
        if dupes_db is not None:
            dupe_of = dupes_db.get_old_url(digest)
        else:
            dupe_of = None
        if dupe_of is not None:
            # Don't extract links from pages we've already seen
            # to avoid loops that descend a directory endlessly
            print(" DUPE {}\n OF {}".format(response.request.url, dupe_of))
            sys.stdout.flush()
            return
        else:
            if dupes_db is not None:
                dupes_db.set_old_url(digest, response.request.url)

    super().scrape_document(request, response, url_item)