Example #1
    def build_crawl_list(self):
        """
        Build a list of all of the URLs based on the depth specified.
        """
        current_depth = 1
        page = requests.get(self.base_url).text
        if self.children > 0:
            self.urls = Page.get_urls(page)[:self.children]
        else:
            self.urls = Page.get_urls(page)
        # Holds previously scanned URLs, so the same page is not fetched and scanned twice
        scanned_urls = []
        while current_depth <= self.depth:
            # Append the links for each page then search it for more
            print 'Starting crawl depth', current_depth, 'with', len(self.urls), 'URLs to scan'
            new_urls = []
            for url in self.urls:
                # If the url is not already scanned, and if it is not an image, xml etc. scan it.
                if url not in scanned_urls:
                    if TasteDotCom.is_wanted_object(url):
                        print 'Looking for child URLs in', url
                        markup = requests.get(url).text
                        scanned_urls.append(url)
                        # Accumulate child URLs from every page at this depth
                        if self.children > 0:
                            new_urls += Page.get_urls(markup)[:self.children]
                        else:
                            new_urls += Page.get_urls(markup)
            print 'Found', len(new_urls), 'new pages'
            # for url in new_urls:
            #     check_and_add(url)
            self.urls += new_urls
            current_depth += 1
        print 'Finished crawling', self.base_url, 'found', len(self.urls), 'total URLs'

    # def run(self):
    #     """
    #     Start Crawling the page specified
    #     """
    #     #todo Make use of this method
    #     print "Starting crawl session for", self.base_url
    #     page = requests.get(self.base_url).text
    #     child_urls = Page.get_urls(page)
    #     for url in child_urls:
    #         self.check_and_add(url)

# def check_and_add(url):
#     pass
Example #2
    def test_get_urls(self):
        # Use a known static page for testing
        html = open("../miscellany/sausage_and_punpkin_curry.html").read()
        urls = Page.get_urls(html)
        self.assertEqual(len(urls), 387)
        pattern = re.compile(r"http://www\.[/\w.+]+")
        for url in urls:
            # Check each URL matches a hyperlink pattern
            self.assertTrue(pattern.match(url))
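
Assuming test_get_urls sits inside a unittest.TestCase subclass (the surrounding class is not shown in the example), a minimal harness to run it with the standard test runner might look like:

import re
import unittest

class PageTests(unittest.TestCase):  # hypothetical name for the containing class
    # ... test_get_urls from above goes here ...
    pass

if __name__ == '__main__':
    unittest.main()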