def run():
    # Parse arguments
    args, config = parse_arguments()

    # Build crawl directories
    build_crawl_dirs(args.url_file)

    # Read URLs
    url_list = parse_url_list(args.url_file, args.start, args.stop)

    # Configure logger
    add_log_file_handler(wl_log, cm.DEFAULT_CRAWL_LOG)

    # Configure controller
    torrc_config = ut.get_dict_subconfig(config, args.config, "torrc")
    controller = TorController(cm.TBB_DIR, torrc_dict=torrc_config,
                               pollute=False)

    # Configure browser
    ffprefs = ut.get_dict_subconfig(config, args.config, "ffpref")
    driver = TorBrowserWrapper(cm.TBB_DIR,
                               tbb_logfile_path=cm.DEFAULT_FF_LOG,
                               tor_cfg=USE_RUNNING_TOR,
                               pref_dict=ffprefs,
                               socks_port=int(torrc_config['socksport']))

    # Instantiate crawler
    crawler = crawler_mod.Crawler(driver, controller,
                                  args.screenshots, args.device)

    # Configure crawl job
    job_config = ut.get_dict_subconfig(config, args.config, "job")
    job = crawler_mod.CrawlJob(job_config, url_list)

    # Set up the headless virtual display (Xvfb); the argument is parsed as "HxW"
    if args.virtual_display:
        xvfb_h = int(args.virtual_display.split('x')[0])
        xvfb_w = int(args.virtual_display.split('x')[1])
    else:
        xvfb_h = cm.DEFAULT_XVFB_WIN_H
        xvfb_w = cm.DEFAULT_XVFB_WIN_W
    xvfb_display = start_xvfb(xvfb_w, xvfb_h)

    # Run the crawl
    chdir(cm.CRAWL_DIR)
    try:
        crawler.crawl(job)
    except KeyboardInterrupt:
        wl_log.warning("Keyboard interrupt! Quitting...")
        sys.exit(-1)
    finally:
        # Post crawl
        post_crawl()
        # Close display
        stop_xvfb(xvfb_display)
    # die
    sys.exit(0)
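# Sketch (not in the original excerpt): run() reads like the script's entry
# point, so a minimal guard such as the following would invoke it when the
# module is executed directly.
if __name__ == '__main__':
    run()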
class RunDriverWithControllerTest(unittest.TestCase):
    """
    This test shows how to run tor with TorController
    and browse with TorBrowserDriver.
    """

    @unittest.skip("Only for didactic purposes.")
    def test_run_driver_with_controller(self):
        # run controller on port N
        custom_socks_port = 6666
        self.tor_controller = TorController(
            cm.TBB_DIR, torrc_dict={'SocksPort': str(custom_socks_port)})
        self.tor_process = self.tor_controller.launch_tor_service()

        # set driver and get a page
        self.tor_driver = TorBrowserDriver(cm.TBB_DIR,
                                           socks_port=custom_socks_port)
        self.tor_driver.get("http://google.com")

        # shutdown
        self.tor_driver.quit()
        self.tor_controller.quit()
@classmethod
def setUpClass(cls):
    # Launch a single Tor process shared by all tests in this class
    cls.tor_controller = TorController(cm.TBB_DIR)
    cls.tor_process = cls.tor_controller.launch_tor_service()
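# Sketch (not in the original excerpt): a matching tearDownClass to release the
# Tor process started in setUpClass above. It assumes TorController.quit()
# shuts the launched service down, as the other tests in this section use it
# that way.
@classmethod
def tearDownClass(cls):
    cls.tor_controller.quit()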
class CrawlerTest(unittest.TestCase):

    def setUp(self):
        # clean dirs
        if isdir(TEST_DIRS):
            shutil.rmtree(TEST_DIRS)
        os.mkdir(TEST_DIRS)
        cm.CONFIG_FILE = os.path.join(cm.TEST_FILES_DIR, 'config.ini')
        self.config = ConfigParser.RawConfigParser()
        self.config.read(cm.CONFIG_FILE)

    def configure_crawler(self, crawl_type, config_section):
        device = netifaces.gateways()['default'][netifaces.AF_INET][1]
        tbb_dir = os.path.abspath(cm.TBB_DIR)

        # Configure controller
        torrc_config = ut.get_dict_subconfig(self.config,
                                             config_section, "torrc")
        self.controller = TorController(tbb_dir,
                                        torrc_dict=torrc_config,
                                        pollute=False)

        # Configure browser
        ffprefs = ut.get_dict_subconfig(self.config,
                                        config_section, "ffpref")
        tbb_logfile_path = os.path.join(cm.LOGS_DIR, cm.FF_LOG_FILENAME)
        socks_port = int(torrc_config['socksport'])
        self.driver = TorBrowserDriver(tbb_dir,
                                       tbb_logfile_path=tbb_logfile_path,
                                       tor_cfg=USE_RUNNING_TOR,
                                       pref_dict=ffprefs,
                                       socks_port=socks_port,
                                       canvas_allowed_hosts=[])

        # Instantiate crawler
        crawl_type = getattr(crawler_mod, "Crawler" + crawl_type)
        screenshots = True
        self.crawler = crawl_type(self.driver, self.controller,
                                  device=device, screenshots=screenshots)

        # Configure job
        self.job_config = ut.get_dict_subconfig(self.config,
                                                config_section, "job")

        # Run display
        virtual_display = ''
        self.xvfb_display = setup_virtual_display(virtual_display)

    @pytest.mark.skipif(bool(os.getenv('CI', False)), reason='Skip in CI')
    def test_crawl(self):
        self.configure_crawler('Base', 'captcha_test')
        job = crawler_mod.CrawlJob(self.job_config, TEST_URL_LIST)
        cm.CRAWL_DIR = os.path.join(TEST_DIRS, 'test_crawl')
        self.run_crawl(job)
        # TODO: test for more conditions...
        self.assertGreater(len(os.listdir(cm.CRAWL_DIR)), 0)
        shutil.rmtree(cm.CRAWL_DIR)

    @pytest.mark.skipif(bool(os.getenv('CI', False)), reason='Skip in CI')
    def test_cloudflare_captcha_page(self):
        expected_pcaps = 2
        self.configure_crawler('WebFP', 'captcha_test')
        url = 'https://cloudflare.com/'
        job = crawler_mod.CrawlJob(self.job_config, [url])
        cm.CRAWL_DIR = os.path.join(TEST_DIRS,
                                    'test_cloudflare_captcha_results')
        build_crawl_dirs()
        os.chdir(cm.CRAWL_DIR)
        try:
            self.crawler.crawl(job)  # we can pass batch and instance numbers
        finally:
            self.driver.quit()
            self.controller.quit()
        capture_dirs = glob(os.path.join(cm.CRAWL_DIR, 'captcha_*'))
        self.assertEqual(expected_pcaps, len(capture_dirs))
        shutil.rmtree(cm.CRAWL_DIR)

    @pytest.mark.skipif(bool(os.getenv('CI', False)), reason='Skip in CI')
    def test_not_captcha_after_captcha(self):
        self.configure_crawler('WebFP', 'captcha_test')
        known_captcha_url = 'https://cloudflare.com'
        known_not_captcha_url = 'https://check.torproject.org/'
        urls = [known_captcha_url, known_not_captcha_url]
        job = crawler_mod.CrawlJob(self.job_config, urls)
        cm.CRAWL_DIR = os.path.join(TEST_DIRS,
                                    'test_not_captcha_after_captcha')
        self.run_crawl(job)
        for _dir in os.listdir(cm.CRAWL_DIR):
            marked_captcha = _dir.startswith('captcha_')
            is_torproject_dir = 'check.torproject.org' in _dir
            if is_torproject_dir:
                self.assertTrue(not marked_captcha)
            else:
                self.assertTrue(marked_captcha)
        shutil.rmtree(cm.CRAWL_DIR)

    @pytest.mark.skipif(bool(os.getenv('CI', False)), reason='Skip in CI')
    def test_captcha_not_captcha_2_batches(self):
        self.configure_crawler('WebFP', 'test_captcha_not_captcha_2_batches')
        known_captcha_url = 'https://cloudflare.com'
        known_not_captcha_url = 'https://check.torproject.org/'
        urls = [known_captcha_url, known_not_captcha_url]
        job = crawler_mod.CrawlJob(self.job_config, urls)
        cm.CRAWL_DIR = os.path.join(TEST_DIRS,
                                    'test_not_captcha_after_captcha')
        self.run_crawl(job)
        for _dir in os.listdir(cm.CRAWL_DIR):
            marked_captcha = _dir.startswith('captcha_')
            is_torproject_dir = 'check.torproject.org' in _dir
            if is_torproject_dir:
                self.assertTrue(not marked_captcha)
            else:
                self.assertTrue(marked_captcha)
        shutil.rmtree(cm.CRAWL_DIR)

    def test_website_in_capture_dir(self):
        self.configure_crawler('WebFP', 'captcha_test')
        url = 'https://cloudflare.com/'
        job = crawler_mod.CrawlJob(self.job_config, [url])
        cm.CRAWL_DIR = os.path.join(cm.TEST_DIR,
                                    'test_website_in_capture_dir')
        self.run_crawl(job)
        for _dir in os.listdir(cm.CRAWL_DIR):
            self.assertTrue('cloudflare.com' in _dir)
        shutil.rmtree(cm.CRAWL_DIR)

    def run_crawl(self, job):
        build_crawl_dirs()
        os.chdir(cm.CRAWL_DIR)
        try:
            self.crawler.crawl(job)  # we can pass batch and instance numbers
        finally:
            self.driver.quit()
            self.controller.quit()

    # @pytest.mark.skipif(bool(os.getenv('CI', False)), reason='Skip in CI')
    def test_middle(self):
        self.configure_crawler('Middle', 'captcha_test')
        job = crawler_mod.CrawlJob(self.job_config, TEST_URL_LIST)
        cm.CRAWL_DIR = os.path.join(TEST_DIRS, 'test_crawl')
        self.run_crawl(job)
        # TODO: test for more conditions...
        self.assertGreater(len(os.listdir(cm.CRAWL_DIR)), 0)
        shutil.rmtree(cm.CRAWL_DIR)
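# Sketch (not in the original excerpt): standard boilerplate to execute the
# unittest test cases above when the test module is run directly.
if __name__ == '__main__':
    unittest.main()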