def test_gen_url_list(self):
    # TODO: create a test file, remove Alexa dependency.
    self.assertEqual(list(wu.gen_url_list(0)), [])
    self.assertEqual(len(list(wu.gen_url_list(10))), 10,
                     "Cannot read 10 URLs, make sure you have the CSV file "
                     "from Alexa in place")  # i.e. ALEXA_TOP1M_PATH
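# The TODO above asks for a local test file instead of the Alexa CSV. The
# sketch below is one possible shape for such a test; it assumes gen_url_list
# accepts a url_file argument (as in the CLI code further down), and the
# fixture path and its contents are hypothetical.
def test_gen_url_list_from_file(self):
    fixture_csv = os.path.join(os.path.dirname(__file__), 'data', 'top10.csv')  # hypothetical fixture
    url_tuples = list(wu.gen_url_list(10, 1, True, fixture_csv))
    self.assertEqual(len(url_tuples), 10)  # fixture is assumed to list 10 ranked URLs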
def test_init_headless_agent(self):
    ha = ag.HeadlessAgent()
    cr_job = ag.CrawlJob(ha)
    crawl_agent_cfg = {
        'main_js': cm.CASPER_JS_LAZY_HOMEPAGER,
        'cmd_line_options': ag.PHANTOM_COMMON_OPTIONS,
        'timeout': 20,
        'screenshot': True,
        'post_visit_func': lp.parse_log_dump_results
    }
    ha = ag.HeadlessAgent()
    ha.setOptions(crawl_agent_cfg)
    limit = 3
    cr_job_cfg = {
        'desc': "Visit top %s sites and use fontconfig's debugging facilities to collect data." % limit,
        'max_parallel_procs': 20,
        'crawl_agent': ha,
        'urls': wu.gen_url_list(limit)
    }
    cr_job.setOptions(cr_job_cfg)
    ag.run_crawl(cr_job)
    self.dirs_to_remove.append(os.path.realpath(cr_job.job_dir))
if args and args[0] == '--url_file':
    url_file = args[1]
    del args[0:2]
if args and args[0] == '--stop':
    stop = int(args[1])
    del args[0:2]
if args and args[0] == '--start':
    start = int(args[1])
    del args[0:2]
if args and args[0] == '--type':
    crawler_type = args[1]
    del args[0:2]
if args and args[0] == '--max_proc':
    max_parallel_procs = int(args[1])
    del args[0:2]

if args:
    print 'Some arguments were not processed, check your command: %s' % args
    sys.exit(1)

if not stop or not crawler_type:
    print 'Cannot get the arguments for stop limit (%s) or crawler_type (%s)' % (stop, crawler_type)
    sys.exit(1)

url_tuples = gen_url_list(stop, start, True, url_file)
crawl_sites(url_tuples, crawler_type, 1 + stop - start, max_parallel_procs)
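# The manual flag handling above could also be written with argparse from the
# standard library. This is only an illustrative sketch: the flag names mirror
# the ones parsed above, but the defaults shown here are assumptions rather
# than values taken from the original script.
import argparse

def parse_crawl_args(argv):
    parser = argparse.ArgumentParser(description='Crawl a ranked list of URLs.')
    parser.add_argument('--url_file', default=None)
    parser.add_argument('--stop', type=int, required=True)
    parser.add_argument('--start', type=int, default=1)
    parser.add_argument('--type', dest='crawler_type', required=True)
    parser.add_argument('--max_proc', dest='max_parallel_procs', type=int, default=10)
    return parser.parse_args(argv)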
def test_get_top_alexa_list_start_stop(self):
    # Both the start and stop ranks are inclusive, hence the expected lengths of 51 and 6.
    top_50_100 = list(wu.gen_url_list(100, 50))
    self.assertEqual(len(top_50_100), 51)
    top_5_10 = list(wu.gen_url_list(10, 5))
    self.assertEqual(len(top_5_10), 6)