Example #1
from random import random

def test_page_graph(url, threads=1, max_page_depth=None, max_pages_to_crawl=None):
  handler = CrawlerHandler()
  home_page = handler.start_crawling(url, threads, max_page_depth, max_pages_to_crawl, 0)

  # Looks for a page that doesn't exist: appending a random suffix yields an unknown URL
  pages_set = handler.page_graph(home_page + str(random()))
  assert len(pages_set) == 0

  # Looks for a page that DOES exist: the graph rooted at the home page must match
  # the graph returned when no argument is passed
  pages_set_1 = handler.page_graph(home_page)
  pages_set_2 = handler.page_graph()
  assert pages_set_1 == pages_set_2
  return pages_set_2
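For reference, this is how the test might be invoked; the URL, thread count, and depth below are placeholder values, not part of the original example:

if __name__ == "__main__":
  # Crawl a (hypothetical) site with 2 worker threads, up to 3 levels deep
  pages = test_page_graph("http://example.com", threads=2, max_page_depth=3)
  print("Crawled %d pages" % len(pages))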
Example #2
    def get(self, url):
      # To prevent abuse, only one request per minute is served to each IP address
      ip = self.request.remote_addr

      # Look for a previous request from the same IP address (bound parameter instead of string interpolation)
      request = Request.gql("WHERE ip = :1", ip).get()
      if request is not None:
        delta = request.is_allowed()
        if delta > 0:  # too little time has passed since the previous request
          # self.error(408)  # Timeout Error
          self.response.set_status(408, "Your IP address has issued a request less than 1 min ago. Please wait %d seconds" % delta)
          return
      else:
        request = Request(ip=ip, page_crawled=url)
        request.save()

      self.response.headers['Content-Type'] = 'application/json'
      handler = CrawlerHandler()

      # Serve the cached page graph for this URL, if one exists
      site_image = memcache.get(url)

      if site_image is None:
        # Crawl with a small delay between requests (one hundredth of a second)
        home_page = handler.start_crawling(url, MAX_PAGE_DEPTH, MAX_PAGES_TO_CRAWL, 0.01)

        if home_page is None:
          self.error(400)  # Bad Request
          return
        else:
          site_image = handler.page_graph(home_page)
          memcache.set(url, site_image)

      self.__responde(site_image)  # writes the JSON response (helper defined elsewhere in the handler)
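The handler above relies on a Request datastore model whose is_allowed() returns how many seconds the client must still wait within the one-minute window. Below is a minimal sketch of such a model, assuming Google App Engine's db API; the ip and page_crawled fields come from the handler, while MIN_INTERVAL, timestamp, and the body of is_allowed() are assumptions:

from datetime import datetime
from google.appengine.ext import db

MIN_INTERVAL = 60  # assumed: seconds that must pass between requests from one IP

class Request(db.Model):
  ip = db.StringProperty(required=True)
  page_crawled = db.StringProperty()
  timestamp = db.DateTimeProperty(auto_now_add=True)  # set when the entity is first stored

  def is_allowed(self):
    # Seconds the client still has to wait; zero or negative means the request can be served
    elapsed = (datetime.utcnow() - self.timestamp).total_seconds()
    return MIN_INTERVAL - elapsed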