def check_url(url, status):
    """Decide whether *url* may be crawled and, if so, enqueue it.

    Gate order: robots.txt availability first, then robots permission,
    then a per-domain politeness delay.  Side effects: may enqueue a
    robots fetch or a page crawl and update the bookkeeping tables.
    """
    url_string = url.geturl()
    host = url.netloc
    rkey = key_from_site(host)
    info('Checking: %s %s', url_string, status)

    # No robots.txt entry yet for this host: request one and bail out.
    if not robots_table.contains(rkey):
        debug('Queueing robots fetch: %s', host)
        robots_table.update(rkey, RobotStatus.FETCHING)
        robots_queue.put(host)
        return

    # robots.txt fetch is still in flight; try again later.
    if robots_table.get(rkey) == RobotStatus.FETCHING:
        debug('Waiting for robot fetch: %s', url_string)
        return

    # robots.txt forbids this URL.
    if not check_robots(url):
        debug('Blocked by robots "%s"', url_string)
        update_fetch_table(key_from_url(url), FetchStatus.ROBOTS_BLACKLIST, 1)
        return

    dom = domain_from_site(host)
    previous_crawl = crawltime_table.get(dom) if crawltime_table.contains(dom) else 0

    # Politeness: at most one fetch per domain per 60 seconds.
    if now() - previous_crawl < 60:
        debug('Waiting for politeness: %s, %d', url_string, now() - previous_crawl)
        return

    debug('Queueing: %s', url_string)
    crawl_queue.put(Page.create(url))
    fetch_table.update(key_from_url(url), FetchStatus.FETCHING)
    domain_counts.update(dom, 1)
    crawltime_table.update(dom, int(now()))
def create(url):
    """Return a fresh Page for *url* with no content and no outlinks."""
    page = Page()
    page.url = url
    page.url_s = url.geturl()
    page.domain = domain_from_site(url.netloc)
    page.content = None
    page.outlinks = []
    return page
def check_url(url, status):
    """Decide whether *url* may be crawled and report the outcome.

    Gate order: robots.txt availability, robots permission, then a
    per-domain politeness delay of POLITE_INTERVAL seconds.

    Returns a FetchCheckStatus:
      PENDING   -- waiting on robots fetch or politeness window
      BLACKLIST -- robots.txt forbids this URL
      CRAWLING  -- URL accepted and queued (or handed to triggers)

    Side effects: may enqueue a robots fetch or a page crawl, and
    updates fetch/domain/crawl-time bookkeeping tables.
    """
    url_s = url.geturl()
    site = url.netloc
    robots_key = key_from_site(site)
    console('Checking: %s %s', url_s, status)

    # No robots.txt entry yet for this site: mark it as being fetched.
    # When triggers are active the table update itself drives the fetch;
    # otherwise queue it explicitly.
    if not robots_table.contains(robots_key):
        console('Queueing robots fetch: %s', site)
        robots_table.update(robots_key, RobotStatus.FETCHING)
        if not crawler_triggers():
            robots_queue.put(site)
        return FetchCheckStatus.PENDING

    # robots.txt fetch still in flight; re-check later.
    if robots_table.get(robots_key) == RobotStatus.FETCHING:
        console('Waiting for robot fetch: %s', url_s)
        return FetchCheckStatus.PENDING

    # robots.txt forbids this URL.
    if not check_robots(url):
        console('Blocked by robots "%s"', url_s)
        update_fetch_table(key_from_url(url), FetchStatus.ROBOTS_BLACKLIST, 1)
        return FetchCheckStatus.BLACKLIST

    last_crawl = 0
    domain = domain_from_site(site)
    if crawltime_table.contains(domain):
        last_crawl = crawltime_table.get(domain)

    # Politeness: at most one fetch per domain per POLITE_INTERVAL seconds.
    if now() - last_crawl < POLITE_INTERVAL:
        console('Waiting for politeness: %s, %d', url_s, now() - last_crawl)
        return FetchCheckStatus.PENDING

    console('Queueing: %s', url_s)
    if not crawler_triggers():
        crawl_queue.put(Page.create(url))
    fetch_table.update(key_from_url(url), FetchStatus.FETCHING)
    domain_counts.update(domain, 1)
    crawltime_table.update(domain, int(now()))
    # Flush pending table traffic so the crawl-time update is visible
    # to other workers before we report CRAWLING.
    crawltime_table.SendUpdates()
    crawltime_table.HandlePutRequests()
    return FetchCheckStatus.CRAWLING
def key_from_url(url):
    """Build a table key for *url*: its domain, a space, then the full URL."""
    return '%s %s' % (domain_from_site(url.netloc), url.geturl())

def url_from_key(key):
    """Invert key_from_url: parse the URL portion after the first space.

    A key with no space is parsed whole, mirroring the find()+1 slice
    of the original formulation.
    """
    url_part = key.split(' ', 1)[-1]
    return urlparse(url_part)
def key_from_site(site):
    """Build a table key for *site*: its domain, a space, then the site."""
    return '%s %s' % (domain_from_site(site), site)

def key_from_url(url):
    """Build a table key for *url*: its domain, a space, then the full URL."""
    return '%s %s' % (domain_from_site(url.netloc), url.geturl())
def key_from_site(site):
    """Build a table key for *site*: its domain, a space, then the site."""
    return '%s %s' % (domain_from_site(site), site)