Example #1
0
 def test_gen_url_list(self):
     # TODO: create a test file, remove Alexa dependency.
     
     self.assertEqual(list(wu.gen_url_list(0)), [])
     self.assertEqual(len(list(wu.gen_url_list(10))), 10, 
                      "Cannot read 10 URLs, make sure you've \
                      the CSV file from Alexa in place") # i.e. ALEXA_TOP1M_PATH
Example #2
0
    def test_gen_url_list(self):
        # TODO: create a test file, remove Alexa dependency.

        self.assertEqual(list(wu.gen_url_list(0)), [])
        self.assertEqual(len(list(wu.gen_url_list(10))), 10,
                         "Cannot read 10 URLs, make sure you've \
                         the CSV file from Alexa in place"
                         )  # i.e. ALEXA_TOP1M_PATH
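
The TODO in both versions of this test asks for a local test file so that the Alexa dependency can be dropped. Below is a minimal sketch of that idea, assuming gen_url_list accepts a url_file argument as it does later in Example #5 and that the file is a plain rank,domain CSV; the test name and the CSV layout are illustrative, not taken from the project.

    def test_gen_url_list_from_local_file(self):
        # Sketch only: replace ALEXA_TOP1M_PATH with a throwaway CSV so the
        # test is self-contained. Assumes the gen_url_list(stop, start,
        # include_rank, url_file) call shape seen in Example #5 and a
        # rank,domain column layout.
        import csv
        import os
        import tempfile

        with tempfile.NamedTemporaryFile('w', suffix='.csv', delete=False) as f:
            writer = csv.writer(f)
            for rank in range(1, 11):
                writer.writerow([rank, 'site%d.example' % rank])

        try:
            self.assertEqual(len(list(wu.gen_url_list(10, 1, True, f.name))), 10)
        finally:
            os.remove(f.name)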
Example #3
0
    def test_init_headless_agent(self):
        ha = ag.HeadlessAgent()
        cr_job = ag.CrawlJob(ha)

        crawl_agent_cfg = {
            'main_js': cm.CASPER_JS_LAZY_HOMEPAGER,
            'cmd_line_options': ag.PHANTOM_COMMON_OPTIONS,
            'timeout': 20,
            'screenshot': True,
            'post_visit_func': lp.parse_log_dump_results
        }

        ha = ag.HeadlessAgent()
        ha.setOptions(crawl_agent_cfg)
        limit = 3
        cr_job_cfg = {
            'desc':
            'Visit top %s sites and use fontconfig\'s debugging facilities to collect data.'
            % limit,
            'max_parallel_procs':
            20,
            'crawl_agent':
            ha,
            'urls':
            wu.gen_url_list(limit)
        }

        cr_job.setOptions(cr_job_cfg)

        ag.run_crawl(cr_job)
        self.dirs_to_remove.append(os.path.realpath(cr_job.job_dir))
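
The agent and the job above are both configured by handing a plain dict to setOptions. The fragment below is only a sketch of what such a dict-based setOptions can look like in general, assuming it simply copies each entry onto the instance; it is not the actual HeadlessAgent or CrawlJob implementation, which may validate or transform its options.

class DictConfigurable(object):
    """Hypothetical illustration of the setOptions pattern used above."""

    def setOptions(self, options):
        # Copy every configuration entry onto the instance as an attribute.
        for key, value in options.items():
            setattr(self, key, value)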
Example #4
0
 def test_init_headless_agent(self):
     ha = ag.HeadlessAgent()
     cr_job = ag.CrawlJob(ha)
 
     crawl_agent_cfg = {
                'main_js' : cm.CASPER_JS_LAZY_HOMEPAGER,
                'cmd_line_options' : ag.PHANTOM_COMMON_OPTIONS,
                'timeout' : 20,
                'screenshot' : True,
                'post_visit_func': lp.parse_log_dump_results
                }
     
     ha = ag.HeadlessAgent()
     ha.setOptions(crawl_agent_cfg)
     limit = 3
     cr_job_cfg = {
               'desc': 'Visit top %s sites and use fontconfig\'s debugging facilities to collect data.' % limit,
               'max_parallel_procs': 20,
               'crawl_agent': ha,
               'urls':  wu.gen_url_list(limit)
               }
     
     cr_job.setOptions(cr_job_cfg)
     
     ag.run_crawl(cr_job)
     self.dirs_to_remove.append(os.path.realpath(cr_job.job_dir))
Example #5
0
    if args and args[0] == '--url_file':
        url_file = args[1]        
        del args[0:2]
    
    if args and args[0] == '--stop':
        stop = int(args[1])        
        del args[0:2]
    
    if args and args[0] == '--start':
        start = int(args[1])        
        del args[0:2]
    
    if args and args[0] == '--type':        
        crawler_type = args[1]
        del args[0:2]
    
    if args and args[0] == '--max_proc':
        max_parallel_procs = int(args[1])        
        del args[0:2]
    
    if args:
        print 'some arguments are not processed, check your command: %s' % (args)
        sys.exit(1)
    
    if not stop or not crawler_type:
        print 'Cannot get the arguments for stop limit %s or crawler_type %s' % (stop, crawler_type)
        sys.exit(1)
        
    url_tuples = gen_url_list(stop, start, True, url_file)
    crawl_sites(url_tuples, crawler_type, 1+stop-start, max_parallel_procs)
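
The hand-rolled option handling above only works when the flags appear in the same relative order as the checks; anything out of order falls through to the final catch-all and aborts. Below is a hedged sketch of the same flags expressed with argparse; only the flag names and value types are taken from the code above, while the function name and the defaults are assumptions.

import argparse

def parse_crawl_args(argv):
    # Sketch only: same flags as the manual parsing above, but order-independent
    # and with built-in error messages. Default values are assumptions.
    parser = argparse.ArgumentParser(description='Crawl a slice of the URL list.')
    parser.add_argument('--url_file', default=None)
    parser.add_argument('--start', type=int, default=1)
    parser.add_argument('--stop', type=int, required=True)
    parser.add_argument('--type', dest='crawler_type', required=True)
    parser.add_argument('--max_proc', dest='max_parallel_procs', type=int, default=20)
    return parser.parse_args(argv)

With a namespace like that, the tail of the script would read its values from the parsed object, e.g. gen_url_list(opts.stop, opts.start, True, opts.url_file).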
Example #6
0
    if args and args[0] == '--url_file':
        url_file = args[1]
        del args[0:2]

    if args and args[0] == '--stop':
        stop = int(args[1])
        del args[0:2]

    if args and args[0] == '--start':
        start = int(args[1])
        del args[0:2]

    if args and args[0] == '--type':
        crawler_type = args[1]
        del args[0:2]

    if args and args[0] == '--max_proc':
        max_parallel_procs = int(args[1])
        del args[0:2]

    if args:
        print 'some arguments are not processed, check your command: %s' % (
            args)
        sys.exit(1)

    if not stop or not crawler_type:
        print 'Cannot get the arguments for stop limit %s or crawler_type %s' % (
            stop, crawler_type)
        sys.exit(1)

    url_tuples = gen_url_list(stop, start, True, url_file)
    crawl_sites(url_tuples, crawler_type, 1 + stop - start, max_parallel_procs)
Example #7
0
    def test_get_top_alexa_list_start_stop(self):
        top_50_100 = list(wu.gen_url_list(100, 50))
        self.assertEqual(len(top_50_100), 51)

        top_5_10 = list(wu.gen_url_list(10, 5))
        self.assertEqual(len(top_5_10), 6)
Example #8
0
 def test_get_top_alexa_list_start_stop(self):
     top_50_100 = list(wu.gen_url_list(100, 50))
     self.assertEqual(len(top_50_100), 51)
     
     top_5_10 = list(wu.gen_url_list(10, 5))
     self.assertEqual(len(top_5_10), 6)
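
The assertions in the last two examples imply that gen_url_list treats both bounds as inclusive: ranks 50 through 100 yield 51 URLs and ranks 5 through 10 yield 6. Below is a minimal sketch of a generator with that inclusive behavior, assuming a rank,domain CSV in the style of the Alexa list; it is illustrative only, not the project's implementation.

import csv

def gen_url_list_sketch(stop, start=1, url_file='top-1m.csv'):
    # Sketch only: yield the domains ranked start..stop inclusive, which is
    # why (100, 50) produces 51 entries and (10, 5) produces 6.
    with open(url_file) as f:
        for row in csv.reader(f):
            rank, domain = int(row[0]), row[1]
            if start <= rank <= stop:
                yield domain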