Exemplo n.º 1
0
 def start(self):
     """
     Crawl every queued URL until the queue is empty.

     Each URL popped from the left of ``self.tocrawl`` is loaded as a
     ``Page``. A summary dict with the keys ``url``, ``assets``, ``links``
     and ``form-targets`` is appended to ``self.results`` and the URL is
     added to ``self.crawled``. Every link of the page that
     ``self.samedomain`` accepts and that is neither crawled nor already
     queued is appended to ``self.tocrawl``.

     There is no error handling yet: any exception raised while loading
     a page propagates to the caller.

     :return: boolean - currently always True
     """
     while self.tocrawl:
         target = self.tocrawl.popleft()
         # Pass the argument lazily so logging formats it only when the
         # record is actually emitted.
         logging.info('\tparsing %s', target)
         page = Page(target)
         self.results.append({
             'url': page.url,
             'assets': list(page.resources),
             'links': list(page.links),
             'form-targets': list(page.form_targets),
         })
         self.crawled.add(target)
         for link in page.links:
             if not self.samedomain(target, link):
                 continue
             # Queue each link at most once: skip anything already
             # visited or already waiting in the queue.
             if link not in self.crawled and link not in self.tocrawl:
                 self.tocrawl.append(link)
     # TODO: add error handling and return False if crawling fails
     return True
Exemplo n.º 2
0
 def test_form_with_button_with_formaction(self):
     # The fixture page must report both "/formaction" and
     # "/button-override" as form targets.
     url = "http://localhost:9999/hasformwithbuttonactionoverride.html"
     page = Page(url)
     expected_targets = {"/formaction", "/button-override"}
     self.assertSetEqual(page.form_targets, expected_targets)
Exemplo n.º 3
0
 def test_form_without_action(self):
     # The single expected form target is exactly the URL the page was
     # loaded from.
     url = "http://localhost:9999/hasformwithoutaction.html"
     page = Page(url)
     expected_targets = {
         "http://localhost:9999/hasformwithoutaction.html",
     }
     self.assertSetEqual(page.form_targets, expected_targets)
Exemplo n.º 4
0
 def test_stylesheet(self):
     # The page's resources must consist of the stylesheet URL only.
     url = "http://localhost:9999/hasstylesheet.html"
     page = Page(url)
     expected_resources = {"http://localhost:9999/stylesheet.css"}
     self.assertSetEqual(page.resources, expected_resources)
Exemplo n.º 5
0
 def test_script(self):
     # The page's resources must consist of the script URL only.
     url = "http://localhost:9999/hasscripts.html"
     page = Page(url)
     expected_resources = {"http://localhost:9999/script.js"}
     self.assertSetEqual(page.resources, expected_resources)
Exemplo n.º 6
0
 def test_image(self):
     # The page's resources must consist of the image URL only.
     url = "http://localhost:9999/hasimage.html"
     page = Page(url)
     expected_resources = {"http://localhost:9999/image.jpg"}
     self.assertSetEqual(page.resources, expected_resources)
Exemplo n.º 7
0
 def test_relative_link_in_a_folder(self):
     # The expected link is an absolute URL that sits in the same
     # "/folder/" directory as the page being loaded.
     url = "http://localhost:9999/folder/inafolder.html"
     page = Page(url)
     expected_links = {"http://localhost:9999/folder/nextpage.html"}
     self.assertSetEqual(page.links, expected_links)
Exemplo n.º 8
0
 def test_relative_link(self):
     # The expected link is an absolute URL on the same host and port
     # as the page being loaded.
     url = "http://localhost:9999/page_relative_link.html"
     page = Page(url)
     expected_links = {"http://localhost:9999/nextpage.html"}
     self.assertSetEqual(page.links, expected_links)
Exemplo n.º 9
0
 def test_server_relative_link(self):
     # Build the expected absolute link from the target's own scheme and
     # network location combined with the "/nextpage.html" path.
     url = "http://localhost:9999/server_relative_link.html"
     parts = urlparse(url)
     link_components = (parts.scheme, parts.netloc, "/nextpage.html", "", "", "")
     expected_link = urlbuild(link_components)
     page = Page(url)
     self.assertSetEqual(page.links, {expected_link})
Exemplo n.º 10
0
 def test_unsupported_scheme(self):
     # No links at all are expected from this fixture page.
     url = "http://localhost:9999/unsupported_scheme.html"
     page = Page(url)
     no_links = set()
     self.assertSetEqual(page.links, no_links)
Exemplo n.º 11
0
 def test_page_with_one_link(self):
     # Exactly one link is expected from this fixture page.
     url = "http://localhost:9999/onelink.html"
     page = Page(url)
     expected_links = {"http://localhost/nextpage"}
     self.assertSetEqual(page.links, expected_links)
Exemplo n.º 12
0
 def test_page_without_valid_url_fails(self):
     # Constructing a Page for this host is expected to raise URLError.
     # The constructor call has to run inside the assertRaises context:
     # a bare ``self.assertRaises(URLError)`` only creates an unused
     # context manager and verifies nothing.
     target = "http://notarealexample.none/"
     with self.assertRaises(URLError):
         Page(target)
Exemplo n.º 13
0
 def test_page_load_httpurl(self):
     # A loaded page must expose the URL it was constructed with.
     url = "http://localhost:9999/"
     loaded = Page(url)
     self.assertEqual(loaded.url, url)
Exemplo n.º 14
0
 def test_form_with_multiple_formactions(self):
     # The fixture page must report "/formaction", "/button-override"
     # and "/input-override" as its form targets.
     url = "http://localhost:9999/hasformwithbuttonandinputactionoverride.html"
     found_targets = Page(url).form_targets
     self.assertSetEqual(
         found_targets,
         {"/formaction", "/button-override", "/input-override"},
     )
Exemplo n.º 15
0
 def test_form_with_input_with_formaction(self):
     # The fixture page must report both "/formaction" and
     # "/overrideaction" as form targets.
     url = 'http://localhost:9999/hasformwithactionoverride.html'
     found_targets = Page(url).form_targets
     self.assertSetEqual(found_targets, {"/formaction", "/overrideaction"})