Example #1
def test_get_out_filename_from_url(self):
    url = 'http://youtube.com'
    prefix = '2'
    self.assertEqual(fu.get_out_filename_from_url(url, prefix),
                     '%s-http-youtube-com.txt' % prefix)
    self.assertEqual(fu.get_out_filename_from_url(url, prefix, '.txt'),
                     '%s-http-youtube-com.txt' % prefix)
    self.assertEqual(fu.get_out_filename_from_url(url, prefix, '.log'),
                     '%s-http-youtube-com.log' % prefix)
Example #2
def crawl_worker(agent_cfg, url_tuple):
    """Crawl given url. Will work in parallel. Cannot be class method."""
    MAX_SLEEP_BEFORE_JOB = 10 # prevent starting all parallel processes at the same instant
    sleep(random() * MAX_SLEEP_BEFORE_JOB) # sleep for a while
    
    try:
        idx, url = url_tuple
        idx = str(idx)
        
        stdout_log = os.path.join(agent_cfg['job_dir'], fu.get_out_filename_from_url(url, str(idx), '.txt'))

        # 'https' is listed without a colon because url[:5] of an https URL is just 'https'
        if url[:5] not in ('data:', 'http:', 'https', 'file:'):
            url = 'http://' + url
        
        proxy_opt = mitm.init_mitmproxy(stdout_log[:-4], agent_cfg['timeout'], agent_cfg['mitm_proxy_logs']) if agent_cfg['use_mitm_proxy'] else ""
        
        if 'chrome_clicker' not in agent_cfg['type']:
            cmd = get_visit_cmd(agent_cfg, proxy_opt, stdout_log, url)
            wl_log.info('>> %s (%s) %s' % (url, idx, cmd))
            status, output = ut.run_cmd(cmd) # Run the command
            if status and status != ERR_CMD_TIMEDOUT:
                wl_log.critical('Error while visiting %s(%s) w/ command: %s: (%s) %s' % (url, idx, cmd, status, output))
            else:
                wl_log.info(' >> ok %s (%s)' % (url, idx))
            
        else:
            cr.crawl_url(agent_cfg['type'], url, proxy_opt)
            
        sleep(2) # this will make sure mitmdump is timed out before we start to process the network dump
        if agent_cfg['post_visit_func']: # this pluggable function will parse the logs and do whatever we want
            agent_cfg['post_visit_func'](stdout_log, crawl_id=agent_cfg['crawl_id'])
            
    except Exception as exc:
        wl_log.critical('Exception in worker function %s %s' % (url_tuple, exc))
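The docstring stresses that crawl_worker has to remain a module-level function: worker processes typically receive the callable by pickling, which rules out bound methods. A hedged sketch of how it might be fanned out with multiprocessing (the agent_cfg values and URLs below are placeholders, not the project's defaults):

from multiprocessing import Pool

def _crawl_one(args):
    # Pool.map passes a single argument, so bundle (agent_cfg, url_tuple) pairs.
    agent_cfg, url_tuple = args
    return crawl_worker(agent_cfg, url_tuple)

if __name__ == '__main__':
    agent_cfg = {'job_dir': '/tmp/crawl-job',      # placeholder values
                 'timeout': 30,
                 'use_mitm_proxy': False,
                 'mitm_proxy_logs': '/tmp/mitm-logs',
                 'type': 'headless',
                 'post_visit_func': None,
                 'crawl_id': 1}
    url_tuples = list(enumerate(['youtube.com', 'example.com'], 1))
    pool = Pool(processes=4)
    pool.map(_crawl_one, [(agent_cfg, t) for t in url_tuples])
    pool.close()
    pool.join()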
Example #3
    def should_crawl_and_log(self,
                             agent_cfg,
                             urls,
                             expected_strs,
                             unexpected_strs=[]):
        # TODO: add support for normal browsers
        if agent_cfg.has_key("type") and 'chrome' in agent_cfg['type']:
            br = ag.ChromeAgent()
        else:
            br = ag.HeadlessAgent()

        if not agent_cfg.has_key("timeout"):
            agent_cfg["timeout"] = DEFAULT_TEST_CRAWL_TIMEOUT

        br.setOptions(agent_cfg)
        cr_job = ag.CrawlJob(br)
        cr_job.urls = [
            urls,
        ] if isinstance(urls, basestring) else urls
        cr_job.url_tuples = zip(xrange(1, len(urls) + 1), urls)

        ag.run_crawl(cr_job)

        self.assertTrue(os.path.isdir(cr_job.job_dir),
                        'No job folder created!')
        for idx, url in enumerate(cr_job.urls):
            outfile = os.path.join(
                cr_job.job_dir,
                fu.get_out_filename_from_url(url, str(idx + 1)))
            self.assertTrue(os.path.isfile(outfile),
                            'Cannot find log file %s' % outfile)
            self.assert_all_patterns_in_file(outfile, expected_strs)
            self.assert_all_patterns_not_in_file(outfile, unexpected_strs)
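should_crawl_and_log is a reusable assertion helper, so a concrete test only has to supply an agent configuration plus the patterns that must (and must not) show up in each per-URL log. A hypothetical caller, with placeholder agent type and patterns:

    def test_headless_crawl_writes_logs(self):
        # 'headless' and the patterns below are placeholders for illustration.
        agent_cfg = {"type": "headless"}
        urls = ['http://example.com']
        self.should_crawl_and_log(agent_cfg, urls,
                                  expected_strs=['example'],
                                  unexpected_strs=['Traceback'])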