Exemplo n.º 1
0
def run():
    """Set up Tor, the browser and a virtual display, then run the crawl.

    The steps, in order: parse the arguments, build the crawl directories,
    read the URL list, attach the crawl log file to ``wl_log``, create the
    Tor controller and the Tor Browser driver from the selected config
    section, build the crawler and the crawl job, start Xvfb, change into
    ``cm.CRAWL_DIR`` and crawl.

    ``post_crawl`` and ``stop_xvfb`` are called once the crawl has been
    attempted, whether or not it raised.  The function never returns
    normally: it ends with ``sys.exit(0)``, or ``sys.exit(-1)`` when the
    crawl is interrupted with Ctrl-C.
    """
    args, config = parse_arguments()

    # Crawl directories
    build_crawl_dirs(args.url_file)

    # URLs to visit, restricted to the requested start/stop range
    urls = parse_url_list(args.url_file, args.start, args.stop)

    # Crawl log file
    add_log_file_handler(wl_log, cm.DEFAULT_CRAWL_LOG)

    # Tor controller, driven by the "torrc" section of the chosen config
    torrc_opts = ut.get_dict_subconfig(config, args.config, "torrc")
    tor_controller = TorController(
        cm.TBB_DIR,
        torrc_dict=torrc_opts,
        pollute=False,
    )

    # Tor Browser driver, driven by the "ffpref" section of the chosen config
    browser_prefs = ut.get_dict_subconfig(config, args.config, "ffpref")
    tbb_driver = TorBrowserWrapper(
        cm.TBB_DIR,
        tbb_logfile_path=cm.DEFAULT_FF_LOG,
        tor_cfg=USE_RUNNING_TOR,
        pref_dict=browser_prefs,
        socks_port=int(torrc_opts['socksport']),
    )

    # Crawler and the job it will execute
    crawler = crawler_mod.Crawler(
        tbb_driver, tor_controller, args.screenshots, args.device)
    job_opts = ut.get_dict_subconfig(config, args.config, "job")
    crawl_job = crawler_mod.CrawlJob(job_opts, urls)

    # Headless display geometry.
    # NOTE(review): the first 'x'-separated field is used as the height and
    # the second as the width -- confirm this matches the expected format
    # of the virtual_display argument.
    if args.virtual_display:
        dims = args.virtual_display.split('x')
        display_h, display_w = int(dims[0]), int(dims[1])
    else:
        display_h = cm.DEFAULT_XVFB_WIN_H
        display_w = cm.DEFAULT_XVFB_WIN_W
    display = start_xvfb(display_w, display_h)

    # Crawl from inside the crawl directory
    chdir(cm.CRAWL_DIR)
    try:
        crawler.crawl(crawl_job)
    except KeyboardInterrupt:
        wl_log.warning("Keyboard interrupt! Quitting...")
        sys.exit(-1)
    finally:
        # Runs on success, on error and on the sys.exit(-1) above.
        post_crawl()
        stop_xvfb(display)

    # Explicit successful exit
    sys.exit(0)
 def test_crawl(self):
     """Crawl TEST_URL_LIST and expect a non-empty crawl directory.

     The crawl directory is removed again once the assertion has passed.
     """
     self.configure_crawler('Base', 'captcha_test')
     crawl_job = crawler_mod.CrawlJob(self.job_config, TEST_URL_LIST)
     crawl_dir = os.path.join(TEST_DIRS, 'test_crawl')
     cm.CRAWL_DIR = crawl_dir
     self.run_crawl(crawl_job)
     # TODO: test for more conditions...
     produced_entries = os.listdir(cm.CRAWL_DIR)
     self.assertGreater(len(produced_entries), 0)
     shutil.rmtree(cm.CRAWL_DIR)
    def test_website_in_capture_dir(self):
        """Check that every entry created by the crawl names the visited site.

        Crawls a single cloudflare.com URL into a dedicated crawl directory
        and asserts that each entry found there has 'cloudflare.com' in its
        name.  The crawl directory is removed once all assertions pass.
        """
        self.configure_crawler('WebFP', 'captcha_test')

        url = 'https://cloudflare.com/'
        job = crawler_mod.CrawlJob(self.job_config, [url])
        cm.CRAWL_DIR = os.path.join(cm.TEST_DIR,
                                    'test_website_in_capture_dir')
        self.run_crawl(job)

        # NOTE(review): an empty crawl directory makes this loop pass
        # vacuously -- confirm whether a non-empty check is wanted here.
        for _dir in os.listdir(cm.CRAWL_DIR):
            # assertIn shows both operands when it fails, which a bare
            # assertTrue(... in ...) does not.
            self.assertIn('cloudflare.com', _dir)
        shutil.rmtree(cm.CRAWL_DIR)
    def test_captcha_not_captcha_2_batches(self):
        """Check which crawl output directories carry the 'captcha_' prefix.

        Crawls a cloudflare.com URL followed by check.torproject.org and then
        inspects every entry of the crawl directory: entries whose name
        contains 'check.torproject.org' must not start with 'captcha_', and
        all other entries must.  The crawl directory is removed once all
        assertions pass.
        """
        self.configure_crawler('WebFP', 'test_captcha_not_captcha_2_batches')

        known_captcha_url = 'https://cloudflare.com'
        known_not_captcha_url = 'https://check.torproject.org/'
        urls = [known_captcha_url, known_not_captcha_url]
        job = crawler_mod.CrawlJob(self.job_config, urls)
        cm.CRAWL_DIR = os.path.join(TEST_DIRS,
                                    'test_not_captcha_after_captcha')
        self.run_crawl(job)

        for _dir in os.listdir(cm.CRAWL_DIR):
            marked_captcha = _dir.startswith('captcha_')
            is_torproject_dir = 'check.torproject.org' in _dir
            # Name the offending directory so a failure is self-explanatory.
            if is_torproject_dir:
                self.assertFalse(
                    marked_captcha,
                    "unexpectedly marked as captcha: %s" % _dir)
            else:
                self.assertTrue(
                    marked_captcha,
                    "expected a captcha marker on: %s" % _dir)
        shutil.rmtree(cm.CRAWL_DIR)
    def test_cloudflare_captcha_page(self):
        """Crawl cloudflare.com and count the 'captcha_*' entries produced.

        Builds the crawl directories, runs the crawl from inside
        ``cm.CRAWL_DIR`` and always quits the driver and the controller
        afterwards.  Exactly two paths matching 'captcha_*' are expected in
        the crawl directory, which is removed once the assertion passes.
        """
        self.configure_crawler('WebFP', 'captcha_test')

        target_urls = ['https://cloudflare.com/']
        crawl_job = crawler_mod.CrawlJob(self.job_config, target_urls)
        cm.CRAWL_DIR = os.path.join(TEST_DIRS,
                                    'test_cloudflare_captcha_results')
        build_crawl_dirs()
        os.chdir(cm.CRAWL_DIR)
        try:
            # crawl() can also be given batch and instance numbers
            self.crawler.crawl(crawl_job)
        finally:
            self.driver.quit()
            self.controller.quit()

        expected_capture_dirs = 2
        captcha_pattern = os.path.join(cm.CRAWL_DIR, 'captcha_*')
        found_capture_dirs = glob(captcha_pattern)
        self.assertEqual(expected_capture_dirs, len(found_capture_dirs))
        shutil.rmtree(cm.CRAWL_DIR)