Пример #1
0
def main():
    """Parse command-line options and start a crawl.

    Options:
        --url:    start url handed to the Webcrawler.
        --amount: integer handed to the Webcrawler as its second argument.
        --clean:  delete an existing ``urls.json`` before crawling.
    """
    parser = argparse.ArgumentParser(
        description='Webcrawler that obtains urls.')
    parser.add_argument('--url',
                        default="https://www.derivative-calculator.net/",
                        help='the start url')
    # type=int keeps a value given on the command line an int, matching the
    # default; without it argparse would store the raw string.
    parser.add_argument('--amount', default=1000, type=int, help='')
    parser.add_argument('--clean',
                        default=False,
                        action='store_true',
                        help='removes urls.json file')
    args = parser.parse_args()
    # Only remove the file when asked to and when it actually exists.
    if args.clean and os.path.exists("urls.json"):
        os.remove("urls.json")
    webcrawler = Webcrawler(args.url, args.amount)
    webcrawler.start_crawl()
Пример #2
0
 def test_initialize(self):
     """Check that a freshly built PageNode exposes the expected attributes."""
     source = Webcrawler(test_Webcrawler.default_url)
     node = PageNode(source.url, source.response)
     expected_attributes = (
         '_parent',
         '_children',
         'sitelinks',
         'externallinks',
         'contentlinks',
         'htmlcontent',
         'javascriptlinks',
         'csslinks',
     )
     # Checked in this fixed order; the first missing attribute fails the test.
     for attribute in expected_attributes:
         self.assertTrue(hasattr(node, attribute))
Пример #3
0
 def test_initialize(self):
     """Check that a Webcrawler built without arguments has the expected attributes."""
     instance = Webcrawler()
     expected_attributes = (
         'url',
         'sitemap',
         'pagenodes',
         'externallinks',
         'contentlinks',
         'visited',
         'url_history',
         'response_history',
         'response',
         'success',
     )
     # Checked in this fixed order; the first missing attribute fails the test.
     for attribute in expected_attributes:
         self.assertTrue(hasattr(instance, attribute))
Пример #4
0
    def test_get_request_urls(self):
        """Exercise the response attribute, get_url() and the url attribute."""
        # Built without a url: the response is expected to be None.
        blank = Webcrawler()
        self.assertIsNone(blank.response)

        # Built with the default url: a response is expected to be present.
        seeded = Webcrawler(test_Webcrawler.default_url)
        self.assertIsNotNone(seeded.response)

        # A successful get_url() is expected to return a truthy value and
        # leave the requested address in .url.
        probe = Webcrawler()
        fetched = probe.get_url('http://google.com')
        self.assertTrue(fetched)
        self.assertEqual(probe.url, 'http://google.com')

        # Direct assignment to .url is expected to be readable back.
        probe.url = test_Webcrawler.default_url
        self.assertEqual(probe.url, test_Webcrawler.default_url)

        # A string that is not a url: get_url() and .success are both
        # expected to be falsy afterwards.
        rejected = probe.get_url('asdljksadfljsldhg')
        self.assertFalse(rejected)
        self.assertFalse(probe.success)
Пример #5
0
 def test_internal_links(self):
     """Check that the PageNode for the default url has at least one site link."""
     crawler1 = Webcrawler(test_Webcrawler.default_url)
     pagenode0 = PageNode(crawler1.url, crawler1.response)
     # Compare by value: assertIsNot tests object identity, which for ints
     # only happens to work because of CPython's small-integer cache.
     self.assertNotEqual(len(pagenode0.sitelinks), 0)
Пример #6
0
 def test_html_document(self):
     """Check that the PageNode's response carries content that is not None."""
     source = Webcrawler(test_Webcrawler.default_url)
     node = PageNode(source.url, source.response)
     body = node.response.content
     self.assertIsNotNone(body)
Пример #7
0
 def test_nodes_count(self):
     """Check num_nodes on a PageNode built from the default url's crawl."""
     default_crawler = Webcrawler(test_Webcrawler.default_url)
     google_crawler = Webcrawler('http://google.com')
     default_page = PageNode(default_crawler.url, default_crawler.response)
     self.assertEqual(default_page.num_nodes, 3)
     # A second node is constructed but nothing is asserted about it.
     PageNode(google_crawler.url, google_crawler.response)
Пример #8
0
 def test_defined_methods(self):
     """Check that each expected method name resolves to a callable."""
     instance = Webcrawler()
     for method_name in ['get_url']:
         # getattr without a default: a missing name raises AttributeError.
         member = getattr(instance, method_name)
         self.assertTrue(callable(member))
Пример #9
0
from Webcrawler import Webcrawler

# Build a crawler for one site and print several of its attributes,
# each separated by a line holding a single space.
crawler = Webcrawler("http://wiprodigital.com")
print(crawler.discoverable)
print(" ")
# Link collections of the first page node, printed in a fixed order.
for link_kind in ("sitelinks", "externallinks", "contentlinks"):
    print(getattr(crawler.pagenodes[0], link_kind))
    print(" ")
print(crawler.visited)
print(" ")
print(crawler.sitemap)