def run():
    """Entry point: configure and execute a full crawl.

    Parses CLI arguments and config, builds the crawl directories, wires up
    the Tor controller and Tor Browser driver, runs the crawl job inside a
    virtual (Xvfb) display, and exits the process when done.

    Exits:
        -1 on KeyboardInterrupt, 0 on normal completion (always calls
        sys.exit; this function does not return).
    """
    # Parse arguments
    args, config = parse_arguments()

    # Build output directory tree for this crawl
    build_crawl_dirs(args.url_file)

    # Read URLs
    url_list = parse_url_list(args.url_file, args.start, args.stop)

    # Configure logger
    add_log_file_handler(wl_log, cm.DEFAULT_CRAWL_LOG)

    # Configure controller
    torrc_config = ut.get_dict_subconfig(config, args.config, "torrc")
    controller = TorController(cm.TBB_DIR,
                               torrc_dict=torrc_config,
                               pollute=False)

    # Configure browser
    ffprefs = ut.get_dict_subconfig(config, args.config, "ffpref")
    driver = TorBrowserWrapper(cm.TBB_DIR,
                               tbb_logfile_path=cm.DEFAULT_FF_LOG,
                               tor_cfg=USE_RUNNING_TOR,
                               pref_dict=ffprefs,
                               socks_port=int(torrc_config['socksport']))

    # Instantiate crawler
    crawler = crawler_mod.Crawler(driver, controller,
                                  args.screenshots, args.device)

    # Configure crawl
    job_config = ut.get_dict_subconfig(config, args.config, "job")
    job = crawler_mod.CrawlJob(job_config, url_list)

    # Set up the headless (Xvfb) display.
    # BUG FIX: the display spec is "WIDTHxHEIGHT" (e.g. "1280x800"), but the
    # original code read index 0 into the height and index 1 into the width,
    # swapping the dimensions handed to start_xvfb. Split once and unpack
    # width-first to match the defaults (DEFAULT_XVFB_WIN_W, DEFAULT_XVFB_WIN_H).
    if args.virtual_display:
        xvfb_w, xvfb_h = (int(dim) for dim in args.virtual_display.split('x'))
    else:
        xvfb_w = cm.DEFAULT_XVFB_WIN_W
        xvfb_h = cm.DEFAULT_XVFB_WIN_H
    xvfb_display = start_xvfb(xvfb_w, xvfb_h)

    # Run the crawl from inside the crawl directory so relative capture
    # paths land in the right place.
    chdir(cm.CRAWL_DIR)
    try:
        crawler.crawl(job)
    except KeyboardInterrupt:
        wl_log.warning("Keyboard interrupt! Quitting...")
        sys.exit(-1)
    finally:
        # Post-crawl bookkeeping runs even on interrupt...
        post_crawl()
        # ...and the virtual display is always torn down.
        stop_xvfb(xvfb_display)

    # die
    sys.exit(0)
def test_crawl(self):
    """Smoke test: a basic crawl should leave at least one artifact behind."""
    self.configure_crawler('Base', 'captcha_test')
    crawl_job = crawler_mod.CrawlJob(self.job_config, TEST_URL_LIST)
    cm.CRAWL_DIR = os.path.join(TEST_DIRS, 'test_crawl')
    self.run_crawl(crawl_job)
    # TODO: test for more conditions...
    crawl_artifacts = os.listdir(cm.CRAWL_DIR)
    self.assertGreater(len(crawl_artifacts), 0)
    shutil.rmtree(cm.CRAWL_DIR)
def test_website_in_capture_dir(self):
    """Every capture directory name should embed the crawled site's domain."""
    self.configure_crawler('WebFP', 'captcha_test')
    url = 'https://cloudflare.com/'
    job = crawler_mod.CrawlJob(self.job_config, [url])
    # Consistency fix: the sibling tests in this file all anchor their
    # output under TEST_DIRS; this one used cm.TEST_DIR, scattering
    # artifacts to a different base directory.
    cm.CRAWL_DIR = os.path.join(TEST_DIRS, 'test_website_in_capture_dir')
    self.run_crawl(job)
    capture_dirs = os.listdir(cm.CRAWL_DIR)
    # Guard against a vacuous pass: an empty crawl directory previously
    # skipped the loop entirely and the test passed without checking anything.
    self.assertGreater(len(capture_dirs), 0)
    for _dir in capture_dirs:
        self.assertIn('cloudflare.com', _dir)
    shutil.rmtree(cm.CRAWL_DIR)
def test_captcha_not_captcha_2_batches(self):
    """Capture dirs for a known-CAPTCHA site get the 'captcha_' prefix;
    dirs for a known non-CAPTCHA site (check.torproject.org) do not.
    """
    self.configure_crawler('WebFP', 'test_captcha_not_captcha_2_batches')
    known_captcha_url = 'https://cloudflare.com'
    known_not_captcha_url = 'https://check.torproject.org/'
    urls = [known_captcha_url, known_not_captcha_url]
    job = crawler_mod.CrawlJob(self.job_config, urls)
    cm.CRAWL_DIR = os.path.join(TEST_DIRS, 'test_not_captcha_after_captcha')
    self.run_crawl(job)
    capture_dirs = os.listdir(cm.CRAWL_DIR)
    # Guard against a vacuous pass: with an empty crawl directory the loop
    # below never runs and the test previously passed without asserting.
    self.assertGreater(len(capture_dirs), 0)
    for _dir in capture_dirs:
        marked_captcha = _dir.startswith('captcha_')
        if 'check.torproject.org' in _dir:
            # Idiom fix: assertFalse(x) instead of assertTrue(not x) —
            # same check, clearer failure message.
            self.assertFalse(marked_captcha)
        else:
            self.assertTrue(marked_captcha)
    shutil.rmtree(cm.CRAWL_DIR)
def test_cloudflare_captcha_page(self):
    """Crawling a known CAPTCHA page should produce exactly two
    'captcha_*' capture directories.
    """
    expected_pcaps = 2
    self.configure_crawler('WebFP', 'captcha_test')
    url = 'https://cloudflare.com/'
    job = crawler_mod.CrawlJob(self.job_config, [url])
    cm.CRAWL_DIR = os.path.join(TEST_DIRS, 'test_cloudflare_captcha_results')
    build_crawl_dirs()
    os.chdir(cm.CRAWL_DIR)
    try:
        # we can pass batch and instance numbers
        self.crawler.crawl(job)
    finally:
        # Browser and controller are shut down even if the crawl raises.
        self.driver.quit()
        self.controller.quit()
    captcha_pattern = os.path.join(cm.CRAWL_DIR, 'captcha_*')
    captcha_dirs = glob(captcha_pattern)
    self.assertEqual(expected_pcaps, len(captcha_dirs))
    shutil.rmtree(cm.CRAWL_DIR)