Example #1
def fake_response(link, content, **response_data):
    """A fake response that can be added to the mirror.
    """
    redirects = response_data.pop('redirects', [])
    # Use the fake internet system to generate a response object.
    # This is more reliable than putting one together manually.
    data = {'stream': content}
    data.update(response_data)
    with internet(**{link.original_url: data}):
        session = TestSession()
        session.mount('http://', TestAdapter())
        session.mount('https://', TestAdapter())
        response = session.request('GET', link.original_url)

    # Additional attributes are expected. This is what the spider
    # does before passing a link to mirror.add(). Possibly we should
    # have less code duplication here with the actual spider code.
    parser_class = get_parser_for_mimetype(get_content_type(response))
    if parser_class:
        response.parsed = parser_class(response.content,
                                       response.url,
                                       encoding=response.encoding)
    else:
        response.parsed = None
    response.links_parsed = HeaderLinkParser(response)
    response.redirects = redirects
    return response
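
A minimal usage sketch for the helper above (not part of the original code): it assumes a Link(url) constructor and a mirror fixture exposing an add(link, response) method, as the docstring suggests.

# Hypothetical test built around fake_response(); Link and the mirror
# fixture are assumptions based on the docstring, not taken from the project.
def test_add_simple_page(mirror):
    link = Link('http://example.org/index.html')
    response = fake_response(link, b'<html><body>hello</body></html>')
    mirror.add(link, response)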
Example #2
    def resolve(self, spider, type):
        content = self.content
        if hasattr(content, 'read'):
            content = self.content.read()

        # Return a fake response
        response = Response()
        response._content = content
        response.url = self.url
        response.status_code = 200
        response.redirects = []

        # Determine a mimetype
        for name in (self.url, self.filename):
            guessed_type = mimetypes.guess_type(name)[0]
            if guessed_type and get_parser_for_mimetype(guessed_type):
                found_type = guessed_type
                break
        else:
            # Assume an HTML file
            found_type = 'text/html'
        response.headers['content-type'] = found_type

        return response
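
The for/else fallback above hinges on the standard library's mimetypes.guess_type(), which returns a (type, encoding) pair and (None, None) when it cannot tell. A standalone illustration (the file names are made up):

import mimetypes

print(mimetypes.guess_type('report.pdf')[0])   # 'application/pdf'
print(mimetypes.guess_type('style.css')[0])    # 'text/css'
print(mimetypes.guess_type('Makefile')[0])     # None -> resolve() falls back to 'text/html'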
Example #3
    def _convert_links_in_file(self, file, url, url_database):
        mimetype = self.url_info[url]['mimetype']
        parser_class = get_parser_for_mimetype(mimetype)
        if not parser_class:
            return

        with self.open(file, 'rb+') as f:
            # A simple way to speed this up would also be to keep a
            # certain contingent of previously-parsed documents in memory.
            parsed = parser_class(f.read(),
                                  self.url_info[url].get('original_url', url),
                                  encoding=self.url_info[url].get('encoding'))

            def replace_link(raw_url):
                # Abuse the URL class to normalize the url for matching
                try:
                    link = Link(raw_url)
                except urlnorm.InvalidUrl:
                    return

                # See what we know about this link. Is the target url
                # saved locally? Is it a known redirect?
                local_filename = redir_url = redir_code = None
                if link.url in url_database:
                    local_filename = url_database[link.url]
                else:
                    if link.url in self.redirects:
                        redir_code, redir_url = self.redirects[link.url]
                        if redir_url in url_database:
                            local_filename = url_database[redir_url]

                # We have the document behind this link available locally
                if local_filename:
                    rel_link = path.relpath(local_filename, path.dirname(file))
                    if link.lossy_url_data.get('fragment'):
                        rel_link += '#' + link.lossy_url_data['fragment']
                    return './{0}'.format(rel_link)

                # It is a permanent redirect, use the redirect target
                elif redir_url and redir_code == 301:
                    return redir_url

                else:
                    # We do not have a local copy. We need to make sure
                    # we set an absolute url with a host part instead.
                    #
                    # We mustn't do this however for links that have
                    # already previously been replaced with a local
                    # link. We can find out if that is the case by
                    # checking our url usage database. If the url is not
                    # in it, then it must be one of ours.
                    # TODO: Not sure if this is fool-proof, or if we could
                    # in theory imagine a server-side link constructed in
                    # such a way that a match would occur here.
                    if link.url in self.url_usage:
                        # The url has already been absolutized by the
                        # parser, so we can simply return it as-is.
                        return raw_url

            new_content = parsed.replace_urls(replace_link)

            # Write new file
            f.seek(0)
            f.write(new_content)
            f.truncate()
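
To make the local-file branch of replace_link() above more concrete, here is a standalone sketch of the relative-link computation; the paths are invented for illustration.

from os import path

local_filename = 'archive/docs/api.html'     # where the linked document was saved
referring_file = 'archive/guide/intro.html'  # file whose links are being rewritten
rel_link = path.relpath(local_filename, path.dirname(referring_file))
# A fragment from the original link would be re-attached afterwards:
print('./{0}'.format(rel_link + '#section-2'))   # ./../docs/api.html#section-2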
Example #4
    def _process_link(self, link):
        # Some links we are not supposed to follow, like <form action=>
        if link.info.get('do-not-follow'):
            self.events.follow_state_changed(link, skipped='no-download')
            return

        # Do not bother to process the same url twice
        if link.url in self._known_urls:
            self.events.follow_state_changed(link, skipped='duplicate')
            return

        # Test whether this is a link that we should even follow
        if link.source != 'user' and not self.rules.follow(link, self):
            self.events.follow_state_changed(link, skipped='rule-deny')
            return

        # Give the rules the option to skip the download, relying
        # on the information in the mirror instead.
        skip_download = self.rules.skip_download(link, self)

        if not skip_download:
            # Go ahead with the request
            response = link.resolve(self, 'full')
            if response is False:
                # This request failed at the connection stage
                if link.exception:
                    if link.retries <= self.max_retries:
                        self.events.follow_state_changed(
                            link, failed='connect-error', exception=link.exception)
                        link.retry()
                        return True
                    return False
                else:
                    self.events.follow_state_changed(link, failed='redirect-error')
                    return False

            # If we have been redirected to a different url, add that
            # url to the queue again.
            if response.redirects:
                redir_link = Link(
                    response.redirects[-1].url, previous=link.previous,
                    redirect_from=link.url, **link.info)
                self._link_queue.append(redir_link)
                self.events.added_to_queue(redir_link)
                response.close()

                # The mirror needs to know about the redirect. The status
                # code of the first redirect in a chain determines the type
                # (i.e. permanent, temporary, etc.)
                self.mirror.add_redirect(
                    link, redir_link, response.status_code)

                self.events.follow_state_changed(link, failed='redirect')
                return

            # Do not follow errors
            if response.status_code >= 400:
                self.events.follow_state_changed(link, failed='http-error')
                return

            # If we have received a 304 not modified response, we are happy
            response_was_304 = response.status_code == 304
            if response_was_304:
                self.events.follow_state_changed(link, failed='not-modified')
            else:
                self.events.follow_state_changed(link, success=True)

        else:
            # We did not download this url
            self.events.follow_state_changed(link, failed='not-expired')
            response = False

        # Attach a link parser now, which will start to work when needed.
        # The mirror might need the links during save, or the spider when
        # the @stop rules pass. Or we might get away without parsing.
        if response and not response_was_304:
            parser_class = get_parser_for_mimetype(get_content_type(response))
            if parser_class:
                response.parsed = parser_class(response.content, response.url,
                                               encoding=response.encoding)
            else:
                response.parsed = None
            response.links_parsed = HeaderLinkParser(response)

        # Save the file locally?
        add_to_known_list = True
        if self.mirror:
            if not skip_download and not response_was_304:
                if isinstance(link, LocalFile):
                    # Local files are used as starting points only; they
                    # are not saved or otherwise treated as real.
                    self.events.save_state_changed(link, saved=False)
                    add_to_known_list = False
                elif self.rules.save(link, self):
                    self.mirror.add(link, response)
                    self.events.save_state_changed(link, saved=True)
                else:
                    self.events.save_state_changed(link, saved=False)
                    # TODO: This means that if a save rule is used, we will
                    # re-download duplicate urls during the follow phase.
                    # Maybe the duplicate check should happen at the
                    # follow level. But then a rule like @save depth=3 would
                    # not be reliable.
                    # Possible solution: move the save test up, before we
                    # do our regular fetch. We can then
                    add_to_known_list = False
            else:
                # Mirror still needs to know we found this url so
                # it won't be deleted during cleanup.
                self.mirror.encounter_url(link)
                self.events.save_state_changed(link, saved=True)

        # No need to process this url again
        if add_to_known_list:
            self._known_urls.add(link.url)
            if link.info.get('redirect_from'):
                self._known_urls.add(link.info.get('redirect_from'))

        # Run a hook that makes it possible to stop now and ignore
        # all the urls contained in this page.
        if self.rules.stop(link, self):
            self.events.bail_state_changed(link, bail=True)
            return

        # Process follow up links.
        #
        # If we didn't properly download a full response, then the mirror
        # can tell us the urls that this page is pointing to.
        num_links_followed = num_links_total = 0
        if skip_download or response_was_304:
            for link_url, info in self.mirror.url_info[link.url]['links']:
                num_links_total += 1
                if self._add(link_url, previous=link, **info):
                    num_links_followed += 1

        else:
            # Add links from the parsed content + the http headers
            for link_url, opts in chain(
                    response.links_parsed,
                    response.parsed or ()):
                # Put together a url object with all the info that
                # we have and that tests can use.
                num_links_total += 1
                if self._add(link_url, previous=link, **opts):
                    num_links_followed += 1

        # Publish bail state. Include the number of links only if we found
        # some (say an http header link) or if this url was parsed for links
        # (in other words, don't send 0 link counts for images).
        bail_extra = {}
        if num_links_total or getattr(response, 'parsed', False):
            bail_extra = dict(links_followed=num_links_followed, links_total=num_links_total)
        self.events.bail_state_changed(link, bail=False, **bail_extra)
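
Every example above goes through get_parser_for_mimetype() and, for live responses, get_content_type(). The project's real implementations are not shown here, but a plausible minimal shape is a simple registry lookup; the PARSERS mapping and both function bodies below are assumptions, not the actual code.

# Assumed shape only.
PARSERS = {}  # e.g. {'text/html': HTMLParser, 'text/css': CSSParser}

def get_content_type(response):
    # requests exposes headers case-insensitively; drop any "; charset=..." part
    return response.headers.get('content-type', '').split(';')[0].strip()

def get_parser_for_mimetype(mimetype):
    # Return a parser class, or None when the content cannot be parsed for links
    return PARSERS.get(mimetype)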