import parser  # module under test, providing extract_assets()


def test_document_without_assets_parses_ok():
    html_document = '''
    <html>
    <head></head>
    <body></body>
    </html>
    '''
    assets = parser.extract_assets(html_document)
    assert assets == []
def test_assets_without_required_tags_parses_ok():
    html_document = '''
    <html>
    <head><link /></head>
    <body><a>Empty link</a></body>
    </html>
    '''
    assets = parser.extract_assets(html_document)
    assert assets == []
def test_document_with_assets_parses_ok():
    html_document = '''
    <!DOCTYPE html>
    <html>
    <head>
        <title></title>
        <link rel="stylesheet" href="styles/app.css" />
        <script src="scripts/app.js"></script>
    </head>
    <body>
        <a href="http://google.com">Here be dragons.</a>
    </body>
    </html>
    '''
    assets = parser.extract_assets(html_document)
    assert assets == [
        u'http://google.com',
        u'styles/app.css',
        u'scripts/app.js',
    ]
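# The three tests above effectively specify parser.extract_assets: anchor
# hrefs come first, followed by stylesheet hrefs and script srcs, and tags
# missing the relevant attribute are ignored. As a point of reference, a
# minimal standard-library sketch that satisfies them might look like the
# following; the real parser module may well use a proper HTML library
# instead, and the _AssetParser name is hypothetical.

from html.parser import HTMLParser


class _AssetParser(HTMLParser):
    """Hypothetical helper: collects references from a/link/script tags."""

    def __init__(self):
        super().__init__()
        self.anchors, self.links, self.scripts = [], [], []

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == 'a' and attrs.get('href'):
            self.anchors.append(attrs['href'])
        elif tag == 'link' and attrs.get('href'):
            self.links.append(attrs['href'])
        elif tag == 'script' and attrs.get('src'):
            self.scripts.append(attrs['src'])


def extract_assets(html_document):
    asset_parser = _AssetParser()
    asset_parser.feed(html_document)
    return asset_parser.anchors + asset_parser.links + asset_parser.scripts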
import posixpath

from urllib.parse import urlparse  # on Python 2: from urlparse import urlparse

import requests

from parser import extract_assets


class CrawlerException(Exception):
    """Raised when a page can't be retrieved or isn't parseable HTML."""
    # Assumed to be defined alongside the crawler; a minimal definition
    # is shown here so the module is self-contained.


class Crawler(object):
    """
    Web crawler which lists pages on a site and prepares a map of
    resources referenced from them.
    """
    _starting_address = None
    _our_address = None
    _session = None

    def __init__(self, url):
        # We need to explicitly add a scheme here,
        # because requests requires one
        if not url.startswith('http'):
            url = 'http://' + url
        self._starting_address = url
        self._our_address = urlparse(self._starting_address)
        self._session = requests.Session()

    def _is_url_local(self, url):
        """
        Check whether the supplied (parsed) URL is local relative to
        our starting point.
        """
        return not url.netloc or url.netloc == self._our_address.netloc

    def _canonicalize(self, url):
        """
        Clean the supplied (parsed) URL, discarding fragments
        (hashbangs) and resolving path traversal symbols ('.', '..').
        URLs external to our starting point are left as they are.
        """
        parsed_path = url.path

        if not self._our_address:
            raise ValueError("Can't canonicalize URL without local address")

        # Must be an external host, leave it as it is
        if not self._is_url_local(url):
            return url

        # We need this to correctly compensate for '../' at the root,
        # e.g. '../about.html'
        if not parsed_path.startswith('/'):
            parsed_path = '/' + parsed_path

        # No need to waste CPU resolving the root path
        if parsed_path in ['.', '/']:
            return self._our_address

        # Compensate for '../' in the URL
        resolved_path = posixpath.normpath(parsed_path)
        # There is something weird going on with trailing slashes
        # (see https://bugs.python.org/issue1707768)
        if url.path.endswith('/'):
            resolved_path += '/'

        canon_url = url._replace(
            scheme=self._our_address.scheme or 'http',
            path=resolved_path,
            netloc=self._our_address.netloc,
            fragment='',
        )
        return canon_url

    def _looks_like_page(self, parts):
        """
        Heuristic detection of possible candidates for asset extraction.
        """
        last_part = parts.path.rsplit('/', 1)[-1]
        return any([
            parts.path.endswith('/'),
            parts.path.endswith('.html'),
            parts.path.endswith('.htm'),
            last_part and '.' not in last_part,
        ])

    def crawl_page(self, full_url):
        """
        Retrieve the requested URL and try to extract all available
        assets from the received content. Raises CrawlerException on
        network errors, non-200 responses and non-HTML content.
        """
        try:
            response = self._session.get(full_url)
        except Exception as e:
            raise CrawlerException(str(e))

        content_type = response.headers.get(
            'Content-Type', 'text/html').split(';')

        if response.status_code != 200:
            raise CrawlerException('Unexpected status: {}'.format(
                response.status_code))
        elif content_type[0] != 'text/html':
            raise CrawlerException(
                'Unexpected Content-Type: {}'.format(content_type[0]))

        return full_url, extract_assets(response.content)
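# To show how the pieces fit together, here is a minimal, hypothetical
# driver that walks a site breadth-first with the class above. The
# crawl_site function and its queue/visited bookkeeping are not part of
# the original code; they only illustrate one way _canonicalize,
# _looks_like_page and crawl_page could be combined.

from urllib.parse import urljoin, urlparse


def crawl_site(start_url):
    """Hypothetical driver: BFS over local pages, mapping page -> assets."""
    crawler = Crawler(start_url)
    site_map = {}
    visited = set()
    queue = [crawler._starting_address]
    while queue:
        page_url = queue.pop(0)
        if page_url in visited:
            continue
        visited.add(page_url)
        try:
            url, assets = crawler.crawl_page(page_url)
        except CrawlerException:
            continue  # skip unreachable or non-HTML pages
        site_map[url] = assets
        for asset in assets:
            # Resolve relative references against the current page,
            # then canonicalize before deciding whether to follow them
            parts = crawler._canonicalize(urlparse(urljoin(page_url, asset)))
            if crawler._is_url_local(parts) and crawler._looks_like_page(parts):
                queue.append(parts.geturl())
    return site_map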
def test_malformed_document_parsing_fails():
    html_document = '<html>Here goes nothing'
    assets = parser.extract_assets(html_document)
    assert assets == []