Пример #1
0
    def parse_seed(self, response):
        """
        Handle a response for a page of the "seed" website.

        The page item built by ``self._load_webpage_item`` is yielded
        first. After that, for every link returned by ``self._get_links``
        the result of either ``self._offsite_request`` (link is external
        relative to the response url) or ``self._onsite_request`` (link is
        internal) is yielded.
        """
        loader = self._load_webpage_item(response, is_seed=True)
        if self.use_splash:
            self._process_splash_response(response, loader)
        yield loader.load_item()

        seed_domain = get_domain(response.url)
        for link in self._get_links(response):
            link_domain = get_domain(link.url)
            if is_external_url(response.url, link.url):
                # Off-site links are counted per (seed domain, target domain).
                request = self._offsite_request(
                    response, link,
                    count_key=(seed_domain, link_domain),
                    max_count=self.max_external_links_per_seed_per_domain
                )
            else:
                # On-site links are counted per domain and followed
                # back through ``self.parse``.
                request = self._onsite_request(
                    response, link,
                    callback=self.parse,
                    max_depth=self.max_depth_seed,
                    count_key=link_domain,
                    max_count=self.max_internal_links_per_seed,
                )
            yield request
Пример #2
0
    def parse_external(self, response):
        """
        Handle a response for a page of an external (non-seed) website.

        The page item built by ``self._load_webpage_item`` is yielded
        first. After that, for every link returned by ``self._get_links``
        the result of either ``self._offsite_request`` (link is external
        relative to the response url) or ``self._onsite_request`` (link is
        internal) is yielded. Both kinds of request are counted per link
        domain against ``self.max_external_links_per_domain``.
        """
        loader = self._load_webpage_item(response, is_seed=False)
        if self.use_splash:
            self._process_splash_response(response, loader)
        yield loader.load_item()

        for link in self._get_links(response):
            link_domain = get_domain(link.url)
            if is_external_url(response.url, link.url):
                # total number of hops is limited by settings.DEPTH_LIMIT
                request = self._offsite_request(
                    response, link,
                    count_key=link_domain,
                    max_count=self.max_external_links_per_domain
                )
            else:
                # Same-site links keep being handled by this callback.
                request = self._onsite_request(
                    response, link,
                    callback=self.parse_external,
                    max_depth=self.max_depth_external,
                    count_key=link_domain,
                    max_count=self.max_external_links_per_domain
                )
            yield request
Пример #3
0
    def parse(self, response):
        """
        Route a response to ``parse_seed`` or ``parse_external``.

        When ``response.meta`` carries a ``'referrer_url'`` and the
        response url is external relative to it, ``link_depth`` in the
        meta is reset to 0 and the response is handed to
        ``parse_external``. In every other case it goes to ``parse_seed``.
        """
        meta = response.meta

        if 'referrer_url' not in meta:
            return self.parse_seed(response)

        if not is_external_url(response.url, meta['referrer_url']):
            return self.parse_seed(response)

        # When we follow a link and it redirects to another domain
        # consider it external even if the link url was on-site.
        meta['link_depth'] = 0
        return self.parse_external(response)