def _schedule_once(self, request, domain, score=0.1):
    """Accept a request object, justify its score and schedule it.

    The method schedules the request as-is, without any modifications
    (except for the score), so make sure all needed headers/metadata/etc.
    are set before calling it.
    """
    robotparser = domain.get('_rp') if domain is not None else None
    if robotparser and not robotparser.can_fetch(self.user_agent, request.url):
        return False
    if request.meta[b'state'] != States.NOT_CRAWLED:
        return False
    hostname = urlsplit(request.url).hostname  # hostname is already lower-cased
    if not hostname:
        self.logger.warning("Can't parse hostname for '%s'", repr(request.url))
        return False
    final_score = justify_request_score_by_hostname(hostname, score)
    self.schedule(request, final_score)
    request.meta[b'state'] = States.QUEUED
    return True
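
# A minimal sketch of what a hostname-based score adjustment could look like.
# This is an illustrative assumption only: the real justify_request_score_by_hostname()
# used above may apply different rules and thresholds.
def _justify_request_score_by_hostname_sketch(hostname, score):
    # Favour shallow hosts (e.g. "example.com" or "www.example.com") over deep
    # subdomains, which tend to be less useful entry points for a broad
    # discovery crawl.
    depth = hostname.count('.')
    return score if depth <= 2 else max(score * 0.5, 0.01)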
def filter_extracted_links(self, request, links):
    netloc, level_2nd_name, domain = self._get_domain_after_redirects(request)
    if is_domain_to_ignore(domain, max_pages=self.max_pages):
        return []
    robotparser = domain.get('_rp')
    chosen_links = []
    for link in links:
        if not self._is_from_same_domain(level_2nd_name, link):
            continue
        # validate that robots.txt (if defined) allows fetching the link
        if robotparser and not robotparser.can_fetch(self.user_agent, link.url):
            continue
        chosen_links.append(link)
        # track the link's hostname under its domain; the subdomains set may
        # later feed the logic that decides whether the domain gets banned
        link_netloc = urlsplit(link.url).netloc
        link_hostname, _, _ = link_netloc.partition(':')
        link_2nd_level, link_domain = self._get_domain(link_netloc)
        subdomains = link_domain.setdefault('subdomains', set())
        subdomains.add(link_hostname)
    return chosen_links
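
# Illustrative sketch (an assumption, not part of the strategy above) of how the
# two methods could be wired together. Assuming the framework has already run
# filter_extracted_links() before invoking the links_extracted() callback, each
# surviving link is scheduled against the originating request's domain entry;
# the score of 0.5 is an arbitrary value chosen for the example.
def links_extracted(self, request, links):
    _, _, domain = self._get_domain_after_redirects(request)
    for link in links:
        self._schedule_once(link, domain, score=0.5)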