Example #1
    def _schedule_once(self, request, domain, score=0.1):
        """Accept a request object, justify its score and schedule it.

        The method schedules a request as-is w/o any modifications (except for score),
        make sure you have set all needed headers/metadata/etc before calling it.
        """
        # honour robots.txt if a parser was attached to the domain entry
        robotparser = domain.get('_rp') if domain is not None else None
        if robotparser and not robotparser.can_fetch(self.user_agent, request.url):
            return False
        # only requests that were never crawled are eligible for scheduling
        if request.meta[b'state'] != States.NOT_CRAWLED:
            return False
        hostname = urlsplit(request.url).hostname  # hostname is already lower-cased
        if not hostname:
            self.logger.warning("Can't parse hostname for %s", repr(request.url))
            return False
        final_score = justify_request_score_by_hostname(hostname, score)
        self.schedule(request, final_score)
        request.meta[b'state'] = States.QUEUED
        return True
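
Both examples read a ready-made robots.txt parser from the domain entry's `_rp` key, but never show how it gets there. Below is a minimal sketch of how such a parser could be built with the standard library's `urllib.robotparser` and attached to the per-domain dict; the `_rp` key and the domain dict come from the snippets themselves, while `attach_robots_parser` and the assumption that the robots.txt body was fetched elsewhere are illustrative only.

from urllib.robotparser import RobotFileParser

def attach_robots_parser(domain, robots_txt_body):
    # parse the already-fetched robots.txt body and cache the parser
    # under the '_rp' key, where the code above looks for it
    rp = RobotFileParser()
    rp.parse(robots_txt_body.splitlines())
    domain['_rp'] = rp
    return rp

# usage sketch: once attached, can_fetch() gates scheduling
# rp = attach_robots_parser(domain, body)
# rp.can_fetch('MyBot/1.0', 'https://example.com/private/page')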
Example #2
    def filter_extracted_links(self, request, links):
        netloc, level_2nd_name, domain = self._get_domain_after_redirects(request)
        if is_domain_to_ignore(domain, max_pages=self.max_pages):
            return []
        robotparser = domain.get('_rp')
        chosen_links = []
        for link in links:
            if not self._is_from_same_domain(level_2nd_name, link):
                continue
            # validate that robots.txt allows fetching it (if defined)
            if robotparser and not robotparser.can_fetch(self.user_agent, link.url):
                continue
            chosen_links.append(link)
            # record the link's hostname in the domain's subdomain set,
            # used later to decide whether the domain is eligible for a ban
            link_netloc = urlsplit(link.url).netloc
            link_hostname, _, _ = link_netloc.partition(':')
            link_2nd_level, link_domain = self._get_domain(link_netloc)
            subdomains = link_domain.setdefault('subdomains', set())
            subdomains.add(link_hostname)
        return chosen_links
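
The helpers `_is_from_same_domain` and `_get_domain` are not shown in the snippet. As a rough sketch, the same-domain check could compare second-level names derived from the netloc; here the second-level name is naively taken as the last two labels of the hostname, which is an assumption — real code would consult a public-suffix list to handle domains like `example.co.uk`. Note also that in the snippet `_is_from_same_domain` receives the link object, so the real helper presumably reads `link.url` itself.

from urllib.parse import urlsplit

def second_level_name(netloc):
    # drop an optional ':port' suffix, then keep the last two labels
    hostname, _, _ = netloc.partition(':')
    return '.'.join(hostname.split('.')[-2:])

def is_from_same_domain(level_2nd_name, url):
    # a link is considered on-site when its second-level name matches
    return second_level_name(urlsplit(url).netloc) == level_2nd_name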