def test_get_out_filename_from_url(self):
    url = 'http://youtube.com'
    prefix = '2'
    self.assertEqual(fu.get_out_filename_from_url(url, prefix),
                     '%s-http-youtube-com.txt' % prefix)
    self.assertEqual(fu.get_out_filename_from_url(url, prefix, '.txt'),
                     '%s-http-youtube-com.txt' % prefix)
    self.assertEqual(fu.get_out_filename_from_url(url, prefix, '.log'),
                     '%s-http-youtube-com.log' % prefix)
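
# Illustrative sketch only (fu.get_out_filename_from_url itself is not shown
# here): a minimal implementation that would satisfy the assertions above,
# assuming the helper replaces non-alphanumeric URL characters with dashes
# and appends the given extension, defaulting to '.txt'.
import re

def _sketch_get_out_filename_from_url(url, prefix, out_ext='.txt'):
    sanitized = re.sub(r'[^0-9a-zA-Z]+', '-', url).strip('-')  # 'http://youtube.com' -> 'http-youtube-com'
    return '%s-%s%s' % (prefix, sanitized, out_ext)

# _sketch_get_out_filename_from_url('http://youtube.com', '2')          -> '2-http-youtube-com.txt'
# _sketch_get_out_filename_from_url('http://youtube.com', '2', '.log')  -> '2-http-youtube-com.log'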

def crawl_worker(agent_cfg, url_tuple):
    """Crawl given url. Will work in parallel. Cannot be class method."""
    MAX_SLEEP_BEFORE_JOB = 10  # prevent starting all parallel processes at the same instant
    sleep(random() * MAX_SLEEP_BEFORE_JOB)  # stagger the worker start times
    try:
        idx, url = url_tuple
        idx = str(idx)
        stdout_log = os.path.join(agent_cfg['job_dir'],
                                  fu.get_out_filename_from_url(url, idx, '.txt'))

        if url[:5] not in ('data:', 'http:', 'https', 'file:'):
            url = 'http://' + url

        if agent_cfg['use_mitm_proxy']:
            proxy_opt = mitm.init_mitmproxy(stdout_log[:-4], agent_cfg['timeout'],
                                            agent_cfg['mitm_proxy_logs'])
        else:
            proxy_opt = ""

        if 'chrome_clicker' not in agent_cfg['type']:
            cmd = get_visit_cmd(agent_cfg, proxy_opt, stdout_log, url)
            wl_log.info('>> %s (%s) %s' % (url, idx, cmd))
            status, output = ut.run_cmd(cmd)  # run the visit command
            if status and status != ERR_CMD_TIMEDOUT:
                wl_log.critical('Error while visiting %s(%s) w/ command: %s: (%s) %s'
                                % (url, idx, cmd, status, output))
            else:
                wl_log.info(' >> ok %s (%s)' % (url, idx))
        else:
            cr.crawl_url(agent_cfg['type'], url, proxy_opt)

        sleep(2)  # make sure mitmdump has timed out before we process the network dump
        if agent_cfg['post_visit_func']:
            # pluggable function that parses the logs and does whatever post-processing we want
            agent_cfg['post_visit_func'](stdout_log, crawl_id=agent_cfg['crawl_id'])
    except Exception as exc:
        wl_log.critical('Exception in worker function %s %s' % (url_tuple, exc))
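
# A minimal parallel-dispatch sketch (not taken from this module; the helper
# names and pool size are assumptions): crawl_worker is a picklable,
# module-level function precisely so it can be handed to multiprocessing,
# hence the "cannot be class method" note in its docstring.
from multiprocessing import Pool

def _pool_job(job):
    """Unpack one (agent_cfg, (idx, url)) pair into a crawl_worker call."""
    agent_cfg, url_tuple = job
    return crawl_worker(agent_cfg, url_tuple)

def _sketch_run_in_parallel(agent_cfg, url_tuples, num_procs=4):
    pool = Pool(processes=num_procs)
    try:
        pool.map(_pool_job, [(agent_cfg, t) for t in url_tuples])
    finally:
        pool.close()
        pool.join()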

def should_crawl_and_log(self, agent_cfg, urls, expected_strs, unexpected_strs=[]):
    # TODO: add support for normal browsers
    if "type" in agent_cfg and 'chrome' in agent_cfg['type']:
        br = ag.ChromeAgent()
    else:
        br = ag.HeadlessAgent()
    if "timeout" not in agent_cfg:
        agent_cfg["timeout"] = DEFAULT_TEST_CRAWL_TIMEOUT
    br.setOptions(agent_cfg)

    cr_job = ag.CrawlJob(br)
    cr_job.urls = [urls, ] if isinstance(urls, basestring) else urls
    cr_job.url_tuples = zip(xrange(1, len(cr_job.urls) + 1), cr_job.urls)
    ag.run_crawl(cr_job)

    self.assertTrue(os.path.isdir(cr_job.job_dir), 'No job folder created!')
    for idx, url in enumerate(cr_job.urls):
        outfile = os.path.join(cr_job.job_dir,
                               fu.get_out_filename_from_url(url, str(idx + 1)))
        self.assertTrue(os.path.isfile(outfile),
                        'Cannot find log file %s' % outfile)
        self.assert_all_patterns_in_file(outfile, expected_strs)
        self.assert_all_patterns_not_in_file(outfile, unexpected_strs)
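
# Hypothetical usage sketch (the agent type string, URL, and expected log
# patterns below are made-up placeholders, not taken from the test suite):
# a test passes an agent config, one or more URLs, and the strings that must
# and must not appear in each per-visit log file.
def test_example_headless_visit(self):
    agent_cfg = {'type': 'headless'}  # anything without 'chrome' selects HeadlessAgent
    self.should_crawl_and_log(agent_cfg,
                              ['http://example.com'],
                              expected_strs=['example.com'],
                              unexpected_strs=['Exception'])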