def fake_response(link, content, **response_data):
    """A fake response that can be added to the mirror.
    """
    redirects = response_data.pop('redirects', [])

    # Use the fake internet system to generate a response object.
    # This is more reliable than putting one together manually.
    data = {'stream': content}
    data.update(response_data)
    with internet(**{link.original_url: data}):
        session = TestSession()
        session.mount('http://', TestAdapter())
        session.mount('https://', TestAdapter())
        response = session.request('GET', link.original_url)

    # Additional attributes are expected. This is what the spider
    # does before passing a link to mirror.add(). Possibly we should
    # have less code duplication here with the actual spider code.
    parser_class = get_parser_for_mimetype(get_content_type(response))
    if parser_class:
        response.parsed = parser_class(response.content, response.url,
                                       encoding=response.encoding)
    else:
        response.parsed = None
    response.links_parsed = HeaderLinkParser(response)
    response.redirects = redirects
    return response
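# Illustrative usage sketch only: it assumes that ``Link`` accepts a url
# string and that a mirror object exposing ``add(link, response)`` is
# available, as in the tests built on top of this helper.
def _example_add_fake_page(mirror):
    link = Link('http://example.org/')
    response = fake_response(link, '<html><body>hello</body></html>')
    mirror.add(link, response)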
def resolve(self, spider, type):
    content = self.content
    if hasattr(content, 'read'):
        content = self.content.read()

    # Return a fake response
    response = Response()
    response._content = content
    response.url = self.url
    response.status_code = 200
    response.redirects = []

    # Determine a mimetype
    for name in (self.url, self.filename):
        guessed_type = mimetypes.guess_type(name)[0]
        if guessed_type and get_parser_for_mimetype(guessed_type):
            found_type = guessed_type
            break
    else:
        # Assume a HTML file
        found_type = 'text/html'
    response.headers['content-type'] = found_type

    return response
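# Standalone sketch of the mimetype fallback used in resolve() above
# (simplified: the get_parser_for_mimetype() check is omitted).
# mimetypes.guess_type() returns a (type, encoding) tuple, and the
# for/else only falls through to the 'text/html' default when no
# candidate name yields a type. The file names here are examples only.
import mimetypes

for name in ('saved-page', 'saved-page.css'):
    guessed_type = mimetypes.guess_type(name)[0]   # None, then 'text/css'
    if guessed_type:
        found_type = guessed_type
        break
else:
    found_type = 'text/html'
print(found_type)   # text/css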
def _convert_links_in_file(self, file, url, url_database):
    mimetype = self.url_info[url]['mimetype']
    parser_class = get_parser_for_mimetype(mimetype)
    if not parser_class:
        return

    with self.open(file, 'rb+') as f:
        # A simple way to speed this up would also be to keep a
        # certain contingent of previously-parsed documents in memory.
        parsed = parser_class(
            f.read(), self.url_info[url].get('original_url', url),
            encoding=self.url_info[url].get('encoding'))

        def replace_link(raw_url):
            # Abuse the URL class to normalize the url for matching
            try:
                link = Link(raw_url)
            except urlnorm.InvalidUrl:
                return

            # See what we know about this link. Is the target url
            # saved locally? Is it a known redirect?
            local_filename = redir_url = redir_code = None
            if link.url in url_database:
                local_filename = url_database[link.url]
            else:
                if link.url in self.redirects:
                    redir_code, redir_url = self.redirects[link.url]
                    if redir_url in url_database:
                        local_filename = url_database[redir_url]

            # We have the document behind this link available locally
            if local_filename:
                rel_link = path.relpath(local_filename, path.dirname(file))
                if link.lossy_url_data.get('fragment'):
                    rel_link += '#' + link.lossy_url_data['fragment']
                return './{0}'.format(rel_link)

            # It is a permanent redirect, use the redirect target
            elif redir_url and redir_code == 301:
                return redir_url

            else:
                # We do not have a local copy. We need to make sure
                # we set an absolute url with a host part instead.
                #
                # We mustn't do this however for links that have
                # already previously been replaced with a local
                # link. We can find out if that is the case by
                # checking our url usage database. If the url is not
                # in it, then it must be one of ours.
                # TODO: Not sure if this is fool-proof, or if we could
                # in theory imagine a server-side link constructed in
                # such a way that a match would occur here.
                if link.url in self.url_usage:
                    # The url has already been absolutized by the
                    # parser, we can simply set it.
                    return raw_url

        new_content = parsed.replace_urls(replace_link)

        # Write new file
        f.seek(0)
        f.write(new_content)
        f.truncate()
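# Standalone sketch of the relative-link rewriting performed by
# replace_link() above, using made-up filenames: a locally saved target is
# referenced relative to the directory of the file being rewritten, and any
# fragment is re-appended to the result.
from os import path

local_filename = 'docs/api/index.html'   # where the link target was saved
current_file = 'docs/guide.html'         # the file whose links are rewritten
fragment = 'section-2'

rel_link = path.relpath(local_filename, path.dirname(current_file))
if fragment:
    rel_link += '#' + fragment
print('./{0}'.format(rel_link))   # ./api/index.html#section-2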
def _process_link(self, link):
    # Some links we are not supposed to follow, like <form action=>
    if link.info.get('do-not-follow'):
        self.events.follow_state_changed(link, skipped='no-download')
        return

    # Do not bother to process the same url twice
    if link.url in self._known_urls:
        self.events.follow_state_changed(link, skipped='duplicate')
        return

    # Test whether this is a link that we should even follow
    if link.source != 'user' and not self.rules.follow(link, self):
        self.events.follow_state_changed(link, skipped='rule-deny')
        return

    # Give the rules the option to skip the download, relying
    # on the information in the mirror instead.
    skip_download = self.rules.skip_download(link, self)
    if not skip_download:
        # Go ahead with the request
        response = link.resolve(self, 'full')
        if response is False:
            # This request failed at the connection stage
            if link.exception:
                if link.retries <= self.max_retries:
                    self.events.follow_state_changed(
                        link, failed='connect-error',
                        exception=link.exception)
                    link.retry()
                    return True
                return False
            else:
                self.events.follow_state_changed(link, failed='redirect-error')
                return False

        # If we have been redirected to a different url, add that
        # url to the queue again.
        if response.redirects:
            redir_link = Link(
                response.redirects[-1].url,
                previous=link.previous,
                redirect_from=link.url,
                **link.info)
            self._link_queue.append(redir_link)
            self.events.added_to_queue(redir_link)
            response.close()

            # The mirror needs to know about the redirect. The status
            # code of the first redirect in a chain determines the type
            # (i.e. permanent, temporary etc)
            self.mirror.add_redirect(
                link, redir_link, response.status_code)

            self.events.follow_state_changed(link, failed='redirect')
            return

        # Do not follow errors
        if response.status_code >= 400:
            self.events.follow_state_changed(link, failed='http-error')
            return

        # If we have received a 304 not modified response, we are happy
        response_was_304 = response.status_code == 304
        if response_was_304:
            self.events.follow_state_changed(link, failed='not-modified')
        else:
            self.events.follow_state_changed(link, success=True)
    else:
        # We did not download this url
        self.events.follow_state_changed(link, failed='not-expired')
        response = False

    # Attach a link parser now, which will start to work when needed.
    # The mirror might need the links during save, or the spider when
    # the @stop rules pass. Or we might get away without parsing.
    if response and not response_was_304:
        parser_class = get_parser_for_mimetype(get_content_type(response))
        if parser_class:
            response.parsed = parser_class(response.content, response.url,
                                           encoding=response.encoding)
        else:
            response.parsed = None
        response.links_parsed = HeaderLinkParser(response)

    # Save the file locally?
    add_to_known_list = True
    if self.mirror:
        if not skip_download and not response_was_304:
            if isinstance(link, LocalFile):
                # Local files are used as starting points only, they
                # are not saved or otherwise treated as real.
                self.events.save_state_changed(link, saved=False)
                add_to_known_list = False
            elif self.rules.save(link, self):
                self.mirror.add(link, response)
                self.events.save_state_changed(link, saved=True)
            else:
                self.events.save_state_changed(link, saved=False)
                # TODO: This means that if a save rule is used, we will
                # re-download duplicate urls during the follow phase.
                # Maybe the duplicate check should happen at the
                # follow level. But then a rule like @save depth=3 would
                # not be reliable.
                # Possible solution: move the save test up, before we
                # do our regular fetch. We can then
                add_to_known_list = False
        else:
            # Mirror still needs to know we found this url so
            # it won't be deleted during cleanup.
            self.mirror.encounter_url(link)
            self.events.save_state_changed(link, saved=True)

    # No need to process this url again
    if add_to_known_list:
        self._known_urls.add(link.url)
        if link.info.get('redirect_from'):
            self._known_urls.add(link.info.get('redirect_from'))

    # Run a hook that makes it possible to stop now and ignore
    # all the urls contained in this page.
    if self.rules.stop(link, self):
        self.events.bail_state_changed(link, bail=True)
        return

    # Process follow-up links.
    #
    # If we didn't properly download a full response, then the mirror
    # can tell us the urls that this page is pointing to.
    num_links_followed = num_links_total = 0
    if skip_download or response_was_304:
        for link_url, info in self.mirror.url_info[link.url]['links']:
            num_links_total += 1
            if self._add(link_url, previous=link, **info):
                num_links_followed += 1
    else:
        # Add links from the parsed content + the http headers
        for link_url, opts in chain(
                response.links_parsed, response.parsed or ()):
            # Put together a url object with all the info that
            # we have and that tests can use.
            num_links_total += 1
            if self._add(link_url, previous=link, **opts):
                num_links_followed += 1

    # Publish bail state. Include the number of links only if we found
    # some (say an http header link) or if this url was parsed for links
    # (in other words, don't send 0 link counts for images).
    bail_extra = {}
    if num_links_total or getattr(response, 'parsed', False):
        bail_extra = dict(links_followed=num_links_followed,
                          links_total=num_links_total)
    self.events.bail_state_changed(link, bail=False, **bail_extra)
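# Standalone sketch of the follow-up link collection above: links found in
# the http headers and links parsed out of the document body are consumed
# as one iterable of (url, options) pairs. The lists and option keys below
# are made up for illustration and stand in for HeaderLinkParser and the
# document parser results.
from itertools import chain

links_from_headers = [('http://example.org/style.css', {'source': 'http-header'})]
links_from_document = [('http://example.org/about.html', {})]

for link_url, opts in chain(links_from_headers, links_from_document):
    print(link_url, opts)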