def build_crawl_list(self): """ Build a list of all of the URLs based on the depth specified. """ current_depth = 1 page = requests.get(self.base_url).text if self.children > 0: self.urls = Page.get_urls(page)[:self.children] else: self.urls = Page.get_urls(page) # Below list holds previously scanned URLs, to stop URLs being added twice scanned_urls = [] while current_depth <= self.depth: # Append the links for each page then search it for more print 'Starting crawl depth', current_depth, 'with', len(self.urls), 'URLs to scan' new_urls = [] for url in self.urls: # If the url is not already scanned, and if it is not an image, xml etc. scan it. if url not in scanned_urls: if TasteDotCom.is_wanted_object(url): print 'Looking for child URLs in ', url markup = requests.get(url).text scanned_urls.append(url) if self.children > 0: new_urls = Page.get_urls(markup)[:self.children] else: new_urls = Page.get_urls(markup) print 'Found', len(new_urls), 'new pages' # for url in new_urls: # check_and_add(url) self.urls += new_urls current_depth += 1 print 'Finished crawling', self.base_url, 'found', len(self.urls), 'total URLs' # def run(self): # """ # Start Crawling the page specified # """ # #todo Make use of this method # print "Starting crawl session for", self.base_url # page = requests.get(self.base_url).text # child_urls = Page.get_urls(page) # for url in child_urls: # self.check_and_add(url) # def check_and_add(url): # pass
def test_get_urls(self):
    """Page.get_urls must extract all 387 hyperlinks from a known static page,
    each matching a basic http URL pattern."""
    # Use a known static page for testing; `with` guarantees the file handle
    # is closed (the original used the bare `file()` builtin and leaked it).
    with open("../miscellany/sausage_and_punpkin_curry.html") as f:
        html = f.read()
    urls = Page.get_urls(html)
    self.assertEqual(len(urls), 387)
    # BUG FIX: the original character class [//\a\w\.\+] contained a stray
    # '\a' (BEL control character) and a duplicated '/'. Dropping both leaves
    # the effective match set for real URLs unchanged.
    pattern = re.compile(r"http://www\.[/\w.+]+")
    for url in urls:
        # Check each URL matches a hyperlink pattern.
        self.assertTrue(pattern.match(url))