Exemplo n.º 1
0
def check_url(url, status):
  """Route *url* through the crawl pipeline.

  Depending on state this either triggers a robots.txt fetch, waits for
  one already in flight, drops a robots-blacklisted URL, or (politeness
  permitting) queues the page for crawling and stamps the crawl time.
  """
  full_url = url.geturl()
  host = url.netloc
  rkey = key_from_site(host)

  info('Checking: %s %s', full_url, status)

  # First sight of this host: kick off a robots.txt fetch and stop here.
  if not robots_table.contains(rkey):
    debug('Queueing robots fetch: %s', host)
    robots_table.update(rkey, RobotStatus.FETCHING)
    robots_queue.put(host)
    return

  # Robots fetch still pending; revisit this URL later.
  if robots_table.get(rkey) == RobotStatus.FETCHING:
    debug('Waiting for robot fetch: %s', full_url)
    return

  # Robots rules reject the URL: record the blacklist hit and drop it.
  if not check_robots(url):
    debug('Blocked by robots "%s"', full_url)
    update_fetch_table(key_from_url(url), FetchStatus.ROBOTS_BLACKLIST, 1)
    return

  crawl_domain = domain_from_site(host)
  last_crawl = crawltime_table.get(crawl_domain) if crawltime_table.contains(crawl_domain) else 0

  # Politeness: at most one crawl per domain per 60 seconds.
  if now() - last_crawl < 60:
    debug('Waiting for politeness: %s, %d', full_url, now() - last_crawl)
  else:
    debug('Queueing: %s', full_url)
    crawl_queue.put(Page.create(url))
    fetch_table.update(key_from_url(url), FetchStatus.FETCHING)
    domain_counts.update(crawl_domain, 1)
    crawltime_table.update(crawl_domain, int(now()))
Exemplo n.º 2
0
def check_url(url, status):
    """Decide what to do with *url*: fetch robots, wait, skip, or crawl.

    Side effects mirror the decision — robots/fetch/crawltime tables are
    updated and work is pushed onto the robots or crawl queue.
    """
    link = url.geturl()
    site_name = url.netloc
    robots_id = key_from_site(site_name)

    info('Checking: %s %s', link, status)

    if not robots_table.contains(robots_id):
        # Unknown host -> mark it and enqueue a robots.txt download.
        debug('Queueing robots fetch: %s', site_name)
        robots_table.update(robots_id, RobotStatus.FETCHING)
        robots_queue.put(site_name)
        return

    if robots_table.get(robots_id) == RobotStatus.FETCHING:
        # robots.txt download still running; come back to this URL later.
        debug('Waiting for robot fetch: %s', link)
        return

    if not check_robots(url):
        # Disallowed by robots.txt: record the URL as blacklisted.
        debug('Blocked by robots "%s"', link)
        update_fetch_table(key_from_url(url), FetchStatus.ROBOTS_BLACKLIST, 1)
        return

    the_domain = domain_from_site(site_name)
    last_crawl = 0
    if crawltime_table.contains(the_domain):
        last_crawl = crawltime_table.get(the_domain)

    # Politeness window: leave the domain alone if crawled within 60s.
    if now() - last_crawl < 60:
        debug('Waiting for politeness: %s, %d', link, now() - last_crawl)
        return

    debug('Queueing: %s', link)
    crawl_queue.put(Page.create(url))
    fetch_table.update(key_from_url(url), FetchStatus.FETCHING)
    domain_counts.update(the_domain, 1)
    crawltime_table.update(the_domain, int(now()))
Exemplo n.º 3
0
 def create(url):
   """Build a fresh Page for *url*: no content yet, no outlinks."""
   page = Page()
   page.domain = domain_from_site(url.netloc)
   page.url = url
   page.url_s = url.geturl()
   page.content = None
   page.outlinks = []
   return page
Exemplo n.º 4
0
def check_url(url, status):
  """Check whether *url* may be crawled now and enqueue the work.

  Returns a FetchCheckStatus:
    PENDING   -- waiting on robots data or the politeness interval
    BLACKLIST -- robots.txt forbids this URL
    CRAWLING  -- the URL was queued for fetching

  Fixes vs. the previous version: the politeness log line now passes
  lazy %-style arguments like every other console() call here (it
  eagerly pre-formatted with '%'), and dead commented-out code was
  removed.
  """
  url_s = url.geturl()
  site = url.netloc
  robots_key = key_from_site(site)

  console('Checking: %s %s', url_s, status)

  if not robots_table.contains(robots_key):
    console('Queueing robots fetch: %s', site)
    robots_table.update(robots_key, RobotStatus.FETCHING)
    # With trigger-driven crawling active the robots fetch is picked up
    # by the trigger machinery; otherwise enqueue it explicitly.
    if not crawler_triggers():
      robots_queue.put(site)
    return FetchCheckStatus.PENDING

  if robots_table.get(robots_key) == RobotStatus.FETCHING:
    console('Waiting for robot fetch: %s', url_s)
    return FetchCheckStatus.PENDING

  if not check_robots(url):
    console('Blocked by robots "%s"', url_s)
    update_fetch_table(key_from_url(url), FetchStatus.ROBOTS_BLACKLIST, 1)
    return FetchCheckStatus.BLACKLIST

  last_crawl = 0
  domain = domain_from_site(site)
  if crawltime_table.contains(domain):
    last_crawl = crawltime_table.get(domain)

  if now() - last_crawl < POLITE_INTERVAL:
    console('Waiting for politeness: %s, %d', url_s, now() - last_crawl)
    return FetchCheckStatus.PENDING
  else:
    console('Queueing: %s', url_s)
    if not crawler_triggers():
      crawl_queue.put(Page.create(url))
    fetch_table.update(key_from_url(url), FetchStatus.FETCHING)
    domain_counts.update(domain, 1)
    crawltime_table.update(domain, int(now()))
    # Flush crawltime updates immediately so other workers observe the
    # new timestamp before running their own politeness checks.
    crawltime_table.SendUpdates()
    crawltime_table.HandlePutRequests()
    return FetchCheckStatus.CRAWLING
Exemplo n.º 5
0
def key_from_url(url):
  """Key = '<domain> <full url>' so entries group by domain."""
  return '%s %s' % (domain_from_site(url.netloc), url.geturl())
def url_from_key(key):
  """Recover the parsed URL from a '<domain> <url>' key.

  If the key contains no space the whole key is parsed, matching the
  find(' ') + 1 fallback of the original formulation.
  """
  raw_url = key.split(' ', 1)[-1]
  return urlparse(raw_url)
Exemplo n.º 6
0
def key_from_site(site):
  """Key = '<domain> <site>' for site-level tables."""
  return '%s %s' % (domain_from_site(site), site)
  
def key_from_url(url):
  """Key = '<domain> <full url>', same layout as key_from_site."""
  return ' '.join((domain_from_site(url.netloc), url.geturl()))
Exemplo n.º 7
0
def key_from_site(site):
    """Compose the table key for *site*: its domain, a space, the site."""
    domain = domain_from_site(site)
    return '%s %s' % (domain, site)