import argparse
import os

from Webcrawler import Webcrawler


def main():
    parser = argparse.ArgumentParser(
        description='Webcrawler that obtains urls.')
    parser.add_argument('--url', default="https://www.derivative-calculator.net/",
                        help='the start url')
    parser.add_argument('--amount', type=int, default=1000,
                        help='maximum number of urls to crawl')
    parser.add_argument('--clean', default=False, action='store_true',
                        help='removes urls.json file')
    args = parser.parse_args()

    # Discard any urls collected by a previous run before crawling again.
    if args.clean and os.path.exists("urls.json"):
        os.remove("urls.json")

    webcrawler = Webcrawler(args.url, args.amount)
    webcrawler.start_crawl()


if __name__ == '__main__':
    main()
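# A sample invocation, assuming the entry point above is saved as crawl.py
# (the filename is hypothetical; the source does not name the file):
#
#     python crawl.py --url https://www.derivative-calculator.net/ --amount 500 --clean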
def test_initialize(self):
    crawler1 = Webcrawler(test_Webcrawler.default_url)
    pagenode0 = PageNode(crawler1.url, crawler1.response)
    self.assertTrue(hasattr(pagenode0, '_parent'))
    self.assertTrue(hasattr(pagenode0, '_children'))
    self.assertTrue(hasattr(pagenode0, 'sitelinks'))
    self.assertTrue(hasattr(pagenode0, 'externallinks'))
    self.assertTrue(hasattr(pagenode0, 'contentlinks'))
    self.assertTrue(hasattr(pagenode0, 'htmlcontent'))
    self.assertTrue(hasattr(pagenode0, 'javascriptlinks'))
    self.assertTrue(hasattr(pagenode0, 'csslinks'))
def test_initialize(self):
    crawler0 = Webcrawler()
    self.assertTrue(hasattr(crawler0, 'url'))
    self.assertTrue(hasattr(crawler0, 'sitemap'))
    self.assertTrue(hasattr(crawler0, 'pagenodes'))
    self.assertTrue(hasattr(crawler0, 'externallinks'))
    self.assertTrue(hasattr(crawler0, 'contentlinks'))
    self.assertTrue(hasattr(crawler0, 'visited'))
    self.assertTrue(hasattr(crawler0, 'url_history'))
    self.assertTrue(hasattr(crawler0, 'response_history'))
    self.assertTrue(hasattr(crawler0, 'response'))
    self.assertTrue(hasattr(crawler0, 'success'))
def test_get_request_urls(self):
    # A crawler built without a url has no response yet.
    crawler0 = Webcrawler()
    self.assertIsNone(crawler0.response)

    # A crawler built with a url fetches it immediately.
    crawler1 = Webcrawler(test_Webcrawler.default_url)
    self.assertIsNotNone(crawler1.response)

    # get_url fetches a new page and updates the crawler's url.
    crawler2 = Webcrawler()
    self.assertTrue(crawler2.get_url('http://google.com'))
    self.assertEqual(crawler2.url, 'http://google.com')
    crawler2.url = test_Webcrawler.default_url
    self.assertEqual(crawler2.url, test_Webcrawler.default_url)

    # An invalid url should fail and clear the success flag.
    self.assertFalse(crawler2.get_url('asdljksadfljsldhg'))
    self.assertFalse(crawler2.success)
def test_internal_links(self):
    crawler1 = Webcrawler(test_Webcrawler.default_url)
    pagenode0 = PageNode(crawler1.url, crawler1.response)
    # assertNotEqual compares values; the original assertIsNot only checked
    # object identity, which is not a reliable test for integers.
    self.assertNotEqual(len(pagenode0.sitelinks), 0)
def test_html_document(self):
    crawler1 = Webcrawler(test_Webcrawler.default_url)
    pagenode0 = PageNode(crawler1.url, crawler1.response)
    self.assertIsNotNone(pagenode0.response.content)
def test_nodes_count(self):
    crawler1 = Webcrawler(test_Webcrawler.default_url)
    crawler2 = Webcrawler('http://google.com')
    pagenode0 = PageNode(crawler1.url, crawler1.response)
    # The expected count of 3 depends on the current structure of the
    # default test page, so this assertion is brittle against a live site.
    self.assertEqual(pagenode0.num_nodes, 3)
    # Constructing a node for a second site should at least not raise.
    pagenode1 = PageNode(crawler2.url, crawler2.response)
    self.assertIsNotNone(pagenode1)
def test_defined_methods(self):
    crawler = Webcrawler()
    methods = ['get_url']
    for m in methods:
        self.assertTrue(callable(getattr(crawler, m)))
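# A minimal sketch of the module scaffolding the tests above assume: the
# methods reference self.assert* and test_Webcrawler.default_url, so they
# presumably live on a unittest.TestCase subclass. The import paths and the
# default_url value are assumptions, not confirmed by the source.
import unittest

from Webcrawler import Webcrawler
from PageNode import PageNode  # assumed module name


class test_Webcrawler(unittest.TestCase):
    default_url = 'https://www.derivative-calculator.net/'  # assumed start url

    # ... the test methods above go here ...


if __name__ == '__main__':
    unittest.main()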
# Demo: crawl a site and dump the data collected from its first page.
from Webcrawler import Webcrawler

crawler = Webcrawler("http://wiprodigital.com")

print(crawler.discoverable)
print()
print(crawler.pagenodes[0].sitelinks)
print()
print(crawler.pagenodes[0].externallinks)
print()
print(crawler.pagenodes[0].contentlinks)
print()
print(crawler.visited)
print()
print(crawler.sitemap)