def test_challenge(self): crawler = WebCrawler(5, VerboseCrawlerLogger) crawler.crawl("triplebyte.github.io/web-crawler-test-site/test1", None, True) url = "http://triplebyte.github.io/web-crawler-test-site/test1/SVG_logo.svg" self.assertEqual(crawler.graph[url].request_type, "head")
def test_challenge(self): crawler = WebCrawler(5, VerboseCrawlerLogger) crawler.crawl("http://triplebyte.github.io/web-crawler-test-site/test2", None, True) target_url = "http://triplebyte.github.io/web-crawler-test-site/test2/page2.html" print(crawler.graph[target_url]) self.assertIsNotNone(crawler.graph[target_url])
def test_challenge(self): crawler = WebCrawler(5, VerboseCrawlerLogger) crawler.crawl("http://triplebyte.github.io/web-crawler-test-site/test2", None, True) target_url = "http://triplebyte.github.io/web-crawler-test-site/test2/page2.html" self.assertIsNotNone(crawler.graph[target_url])
def test_crawling(self): crawler = WebCrawler(100, SilentCrawlerLogger) crawler.crawl( "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/", None, True) self.assert_crawled_with_get( "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2", crawler) self.assert_crawled_with_get( "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-real", crawler) self.assert_crawled_with_get( "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake", crawler) self.assertEqual( crawler.graph.nodes[ "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-real"] .status, 'success') self.assertEqual( crawler.graph.nodes[ "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake"] .status_code, 404) self.assertEqual( crawler.graph.nodes[ "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake"] .status, 'success') self.assertIn( "http://cdn.business2community.com/wp-content/uploads/2013/07/terrible-content.jpg", crawler.graph.nodes) self.assertEqual( crawler.graph.nodes[ "http://cdn.business2community.com/wp-content/uploads/2013/07/terrible-content.jpg"] .request_type, 'head')
def test_crawling(self): crawler = WebCrawler(100, SilentCrawlerLogger) crawler.crawl( "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/", None, True) self.assertIn( "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2", crawler.graph.nodes) self.assertIn( "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-real", crawler.graph.nodes) self.assertIn( "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake", crawler.graph.nodes) self.assertEqual( crawler.graph.nodes[ "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-real"] .status, 'success') self.assertEqual( crawler.graph.nodes[ "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake"] .status_code, 404) self.assertEqual( crawler.graph.nodes[ "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake"] .status, 'success')
def test_challenge(self): crawler = WebCrawler(5, VerboseCrawlerLogger) crawler.crawl( "http://triplebyte.github.io/web-crawler-test-site/test4/", None, True) self.assertTrue( "https://triplebyte.github.io/web-crawler-test-site/test4/page3" in crawler.graph.nodes)
def test_crawling(self): crawler = WebCrawler(100, SilentCrawlerLogger) crawler.crawl("http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/", None, True) self.assertIn("http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2", crawler.graph.nodes) self.assertIn("http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-real", crawler.graph.nodes) self.assertIn("http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake", crawler.graph.nodes) self.assertEqual(crawler.graph.nodes["http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-real"].status, 'success') self.assertEqual(crawler.graph.nodes["http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake"].status_code, 404) self.assertEqual(crawler.graph.nodes["http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake"].status, 'success')
def test_challenge(self):
    # The bug here is that the crawler will hang. Don't sit around waiting
    # for it to finish!
    crawler = WebCrawler(5, VerboseCrawlerLogger)
    crawler.crawl("http://triplebyte.github.io/web-crawler-test-site/test3/", None, True)
    self.assertIn("http://blah.com:7091", crawler.graph.nodes)
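Since this test documents a crawl that is known to hang, one defensive pattern is to drive the crawl from a daemon thread and fail the test if it does not return within a fixed window. A minimal sketch, assuming the crawl(url, output_file, flag) signature used above; the helper name and the 30-second limit are illustrative, not part of the original suite:

import threading

def crawl_with_timeout(crawler, url, timeout=30):
    # Run the crawl in a daemon thread so a hang cannot block the test runner.
    worker = threading.Thread(target=crawler.crawl, args=(url, None, True), daemon=True)
    worker.start()
    worker.join(timeout)
    # True if the crawl returned within the timeout, False if it is still running.
    return not worker.is_alive()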
def test_crawling_triplebyte(self):
    crawler = WebCrawler(100, SilentCrawlerLogger)
    crawler.crawl("https://www.triplebyte.com", None, True)
    self.assertIn("https://www.triplebyte.com", crawler.graph.nodes)
    self.assertIn("https://triplebyte.com/careers", crawler.graph.nodes)
    self.assertEqual(
        crawler.graph.nodes["http://www.olark.com?welcome"].request_type, "head")
def main():
    url = 'http://revistaautoesporte.globo.com/rss/ultimas/feed.xml'
    crawler = WebCrawler(url)
    data = crawler.build_data()
    crawler.data_to_file(data)
    print(crawler.dump_data(data))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("target")
    parser.add_argument("--number_of_threads", type=int, default=5)
    parser.add_argument("--output_file")
    parser.add_argument("--verbose", help="increase output verbosity",
                        action="store_true")
    args = parser.parse_args()
    logger = loggers.VerboseCrawlerLogger if args.verbose else loggers.SilentCrawlerLogger
    webcrawler = WebCrawler(args.number_of_threads, logger)
    webcrawler.crawl(args.target, args.output_file)
def get_crawler(uri: str, chrome_driver_path, dump_to_local):
    if parse.urlparse(uri).scheme in ('http', 'https'):
        scraper = WebCrawler(uri, chrome_driver_path, dump_to_local)
    else:
        scraper = LocalCrawler(uri)
    return scraper
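A brief usage sketch for the dispatcher above, assuming `parse` refers to `urllib.parse` and that `WebCrawler` and `LocalCrawler` accept the constructor arguments shown; the example paths are placeholders:

from urllib import parse  # matches the parse.urlparse(...) call above

# HTTP(S) URIs go to WebCrawler (which takes a chromedriver path);
# anything else falls back to LocalCrawler.
remote = get_crawler("https://example.com", "/path/to/chromedriver", dump_to_local=False)
local = get_crawler("./saved_pages/index.html", None, dump_to_local=False)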
def get_one_day(self):
    params = {
        'q': self.city,
        'appid': self.api_key,
        'lang': self.lang,
        'units': self.units
    }
    uri = self.uri + "?" + urllib.parse.urlencode(params, True)
    print(uri)
    data = WebCrawler.get_data(uri)
    print(data)
    self.decode_json(data)
    return self.speech_string
class TestWebCrawler(unittest.TestCase):

    def setUp(self) -> None:
        self.content_fetcher = unittest.mock.Mock()
        self.content_fetcher.retrieve_page.return_value = self.generate_mock_page()
        self.web_crawler = WebCrawler(self.content_fetcher)

    def generate_mock_page(self):
        return "<!DOCTYPE html>" \
               "<html><body><h1>Some header</h1><p>Some text</p>" \
               "<a href='http://some_link2.com'></a>" \
               "<a href='http://some_link2.com'></a>" \
               "</body></html>"

    def test_crawl_does_not_return_duplicate_links(self):
        urls = self.web_crawler.discover("http://some_link.com", limit=10)
        self.assertEqual(urls, ["http://some_link.com", "http://some_link2.com"])

    def test_crawl_does_not_give_more_links_than_the_limit(self):
        urls = self.web_crawler.discover("http://some_link.com", limit=1)
        self.assertEqual(urls, ["http://some_link.com"])
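The two tests above pin down what they expect from `WebCrawler.discover`: the start URL comes first, duplicate links are collapsed, and the result is truncated at `limit`. A minimal sketch that would satisfy them, assuming only that the injected fetcher exposes `retrieve_page(url)` returning an HTML string; the naive regex link extraction is illustrative, not the project's actual parsing code:

import re


class WebCrawler:
    """Sketch only: breadth-first link discovery over an injected fetcher."""

    def __init__(self, content_fetcher):
        self.content_fetcher = content_fetcher

    def discover(self, start_url, limit):
        seen, queue = [], [start_url]
        while queue and len(seen) < limit:
            url = queue.pop(0)
            if url in seen:
                continue
            seen.append(url)
            html = self.content_fetcher.retrieve_page(url)
            # Naive href extraction; a real implementation would use an HTML parser.
            for link in re.findall(r"href='([^']+)'", html):
                if link not in seen:
                    queue.append(link)
        return seen

Keeping the fetcher injected is what lets the tests above run with a Mock and no network access.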
def __init__(self, outfile, startUrl, limit, searchType, keyword):
    self.outfile = outfile
    self.startPage = startUrl
    self.limit = limit
    if keyword is None:
        self.keywordExists = False
    else:
        self.keywordExists = True
        self.keyword = keyword
    self.searchType = searchType
    self.currentLevel = 0
    self.webCrawler = WebCrawler(keyword)
    self.idCount = -1
    # 0 represents root level
    self.rootNode = PageNode(None, self.getUID(), startUrl, 0)
    self.activeNode = None
    self.rootError = None
    self.crawled = set()
    # seed the random integer generator for DFS method
    random.seed()
from parse import Parse
from webcrawler import WebCrawler
from interface import Interface

if __name__ == "__main__":
    interface = Interface()
    parse = Parse()
    args = parse.get_parse()
    parse.do_parse(args)
    webcrawler = WebCrawler(parse)
    webcrawler.get_headers(interface.header_inter())
    webcrawler.get_data(interface.data_inter())
    webcrawler.get_url(interface.url_inter())
    webcrawler.do_crawl()
############################################
#         Parser for mystuwe.de            #
############################################
from webcrawler import WebCrawler
from stuweparser import StuweParser
from datetime import *

morgenstelle = "http://www.my-stuwe.de/mensa/mensa-morgenstelle-tuebingen"
wilhelm = "http://www.my-stuwe.de/mensa/mensa-wilhelmstrasse-tuebingen/"
alldaysWillhelm = "http://www.my-stuwe.de/mensa/mensa-wilhelmstrasse-tuebingen/?woche=" + str(datetime.today().isocalendar()[1] + 1)
alldays = "http://www.my-stuwe.de/mensa/mensa-morgenstelle-tuebingen/?woche=" + str(datetime.today().isocalendar()[1] + 1)

print("Crawling: " + alldays)
crawler = WebCrawler(alldays)
print("Crawling: " + alldaysWillhelm)
crawler2 = WebCrawler(alldaysWillhelm)

print("Start xml generation")
parser = StuweParser(crawler.getHTML())
parser2 = StuweParser(crawler2.getHTML())
try:
    # parser.generateXML()
    parser.generateWeekXML("overviewMorgen.xml")
    parser2.generateWeekXML("overviewWillhelm.xml")
    print("XML generated")
except Exception as e:
    print("An error occurred while generating xml file")
    print(e)
from webcrawler import WebCrawler
import sys

firefox = {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0'}

try:
    user_input = sys.argv[1]
except IndexError:
    user_input = False

crawler = WebCrawler(user_input, firefox)
# crawler.validate_url()
crawler.get_amazon_image()
#!/usr/bin/python
import sys
from webcrawler import WebCrawler

if __name__ == "__main__":
    website = 'https://pier31.co'
    if len(sys.argv) <= 1:
        print("\nYou didn't enter an address. Defaulting to %s" % website)
    else:
        website = sys.argv[1]
        print("\nChosen address: %s" % website)
    web_crawler = WebCrawler(website)
    web_crawler.crawl_it()