def start(self):
    """
    Crawl every queued URL until the queue is empty.

    Each URL is popped from the left of ``self.tocrawl``, loaded as a
    ``Page`` and summarised into ``self.results``. Links found on the page
    that pass ``self.samedomain`` and are neither crawled nor already queued
    are appended to the queue.

    :return: boolean - currently always true
    """
    while self.tocrawl:
        current = self.tocrawl.popleft()
        logging.info('\tparsing %s' % current)
        fetched = Page(current)
        self.results.append({
            'url': fetched.url,
            'assets': list(fetched.resources),
            'links': list(fetched.links),
            'form-targets': list(fetched.form_targets),
        })
        # Mark as crawled before queueing links so a self-link is not re-added.
        self.crawled.add(current)
        for candidate in fetched.links:
            if not self.samedomain(current, candidate):
                continue
            if candidate in self.crawled or candidate in self.tocrawl:
                continue
            self.tocrawl.append(candidate)
    # TODO: add error handling and return False if crawling fails
    return True
def test_form_with_button_with_formaction(self):
    """Form targets include both the form action and the button override."""
    url = "http://localhost:9999/hasformwithbuttonactionoverride.html"
    expected = {"/formaction", "/button-override"}
    page = Page(url)
    self.assertSetEqual(page.form_targets, expected)
def test_form_without_action(self):
    """A form with no action is expected to target the page's own URL."""
    url = "http://localhost:9999/hasformwithoutaction.html"
    expected = {"http://localhost:9999/hasformwithoutaction.html"}
    page = Page(url)
    self.assertSetEqual(page.form_targets, expected)
def test_stylesheet(self):
    """The fixture page's stylesheet is expected to be its only resource."""
    url = "http://localhost:9999/hasstylesheet.html"
    expected = {"http://localhost:9999/stylesheet.css"}
    page = Page(url)
    self.assertSetEqual(page.resources, expected)
def test_script(self):
    """The fixture page's script is expected to be its only resource."""
    url = "http://localhost:9999/hasscripts.html"
    expected = {"http://localhost:9999/script.js"}
    page = Page(url)
    self.assertSetEqual(page.resources, expected)
def test_image(self):
    """The fixture page's image is expected to be its only resource."""
    url = "http://localhost:9999/hasimage.html"
    expected = {"http://localhost:9999/image.jpg"}
    page = Page(url)
    self.assertSetEqual(page.resources, expected)
def test_relative_link_in_a_folder(self):
    """A relative link on a page inside a folder resolves within that folder."""
    url = "http://localhost:9999/folder/inafolder.html"
    expected = {"http://localhost:9999/folder/nextpage.html"}
    page = Page(url)
    self.assertSetEqual(page.links, expected)
def test_relative_link(self):
    """A relative link is expected to resolve against the page's own URL."""
    url = "http://localhost:9999/page_relative_link.html"
    expected = {"http://localhost:9999/nextpage.html"}
    page = Page(url)
    self.assertSetEqual(page.links, expected)
def test_server_relative_link(self):
    """A server-relative link resolves against the page's scheme and host."""
    url = "http://localhost:9999/server_relative_link.html"
    parts = urlparse(url)
    # Build the expected absolute URL from the page's own scheme and netloc.
    expected = {
        urlbuild((parts.scheme, parts.netloc, "/nextpage.html", "", "", "")),
    }
    page = Page(url)
    self.assertSetEqual(page.links, expected)
def test_unsupported_scheme(self):
    """The unsupported-scheme fixture page is expected to yield no links."""
    url = "http://localhost:9999/unsupported_scheme.html"
    page = Page(url)
    self.assertSetEqual(page.links, set())
def test_page_with_one_link(self):
    """The one-link fixture page is expected to yield exactly that link."""
    url = "http://localhost:9999/onelink.html"
    expected = {"http://localhost/nextpage"}
    page = Page(url)
    self.assertSetEqual(page.links, expected)
def test_page_without_valid_url_fails(self):
    """
    Loading a page from an unresolvable host must raise URLError.

    The Page constructor runs inside the assertRaises context so the
    expected exception is actually checked. Calling assertRaises(URLError)
    with no callable, after the page was already built, verifies nothing.
    """
    target = "http://notarealexample.none/"
    # NOTE(review): assumes Page fetches the URL at construction time, as the
    # other tests rely on - confirm against the Page implementation.
    with self.assertRaises(URLError):
        Page(target)
def test_page_load_httpurl(self):
    """A page loaded from an http URL reports that same URL."""
    url = "http://localhost:9999/"
    page = Page(url)
    self.assertEqual(page.url, url)
def test_form_with_multiple_formactions(self):
    """Form targets include the form action plus button and input overrides."""
    url = "http://localhost:9999/hasformwithbuttonandinputactionoverride.html"
    page = Page(url)
    self.assertSetEqual(
        page.form_targets,
        {"/formaction", "/button-override", "/input-override"},
    )
def test_form_with_input_with_formaction(self):
    """Form targets include both the form action and the input override."""
    url = 'http://localhost:9999/hasformwithactionoverride.html'
    page = Page(url)
    self.assertSetEqual(
        page.form_targets,
        {"/formaction", "/overrideaction"},
    )