from random import random

def test_page_graph(url, threads=1, max_page_depth=None, max_pages_to_crawl=None):
    handler = CrawlerHandler()
    home_page = handler.start_crawling(url, threads, max_page_depth, max_pages_to_crawl, 0)

    # Look for a page that doesn't exist: appending a random suffix to the home
    # page should yield a key with no entry in the page graph.
    pages_set = handler.page_graph(home_page + str(random()))
    assert len(pages_set) == 0

    # Look for a page that DOES exist: querying the graph for the home page
    # explicitly must give the same result as the default (whole-site) query.
    pages_set_1 = handler.page_graph(home_page)
    pages_set_2 = handler.page_graph()
    assert pages_set_1 == pages_set_2

    return pages_set_2
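# A hypothetical invocation of the test above; the URL and the crawl limits are
# placeholder values, not taken from the project.
if __name__ == "__main__":
    graph = test_page_graph("http://example.com", threads=2,
                            max_page_depth=3, max_pages_to_crawl=50)
    print("%d pages reachable from the home page" % len(graph))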
def get(self, url):
    ip = self.request.remote_addr

    # To prevent abuse, only one request per minute is served for each IP address,
    # so first look for a previous request from the same address.
    request = Request.gql("WHERE ip = :1", ip).get()
    if request is not None:
        delta = request.is_allowed()
        if delta > 0:
            # Too little time has passed since the previous request.
            # self.error(408)  # Timeout Error
            self.response.set_status(408, "Your IP address has issued a request less than 1 min ago. Please wait %d seconds" % delta)
            return
    else:
        request = Request(ip=ip, page_crawled=url)
        request.save()

    self.response.headers['Content-Type'] = 'application/json'
    handler = CrawlerHandler()

    site_image = memcache.get(url)
    if site_image is None:
        # Not cached yet: crawl the site with a small delay between requests
        # (one 100th of a second), then cache the resulting page graph.
        home_page = handler.start_crawling(url, MAX_PAGE_DEPTH, MAX_PAGES_TO_CRAWL, 0.01)
        if home_page is None:
            self.error(400)  # Bad Request
            return
        site_image = handler.page_graph(home_page)
        memcache.set(url, site_image)

    self.__responde(site_image)
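# The handler above depends on a Request model with an is_allowed() method that is
# not shown in this section. Below is a minimal sketch of what such a model could
# look like, assuming the App Engine db API, a 60-second rate-limit window and these
# property names; none of this is taken from the original code.
from datetime import datetime

from google.appengine.ext import db

MIN_INTERVAL_SECONDS = 60  # assumed rate-limit window: one request per minute per IP


class Request(db.Model):
    ip = db.StringProperty(required=True)
    page_crawled = db.StringProperty()
    timestamp = db.DateTimeProperty(auto_now=True)  # refreshed on every put()

    def is_allowed(self):
        # Returns 0 if the rate-limit window has expired, otherwise the number of
        # seconds the client still has to wait (the handler reports this value
        # back to the client in the 408 message).
        elapsed = datetime.now() - self.timestamp
        remaining = MIN_INTERVAL_SECONDS - (elapsed.days * 86400 + elapsed.seconds)
        return remaining if remaining > 0 else 0

    def save(self):
        # Thin wrapper around db.Model.put(), matching the call used in get().
        return self.put()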