def test_spider_distributed():
    """
    Test the distributed spider: build a fetcher/parser/saver trio, connect
    the spider to Redis, seed a start URL, and block until the crawl is done.

    No return value; output is written to "out_spider_distributed.txt".
    """
    # initial fetcher / parser / saver — you can also subclass any of these three
    fetcher = spider.Fetcher(max_repeat=3, sleep_time=0)
    parser = spider.Parser(max_deep=-1)

    # NOTE(fix): the original passed a bare open(...) handle that was never
    # closed. The `with` block is safe here because start_work_and_wait_done
    # blocks until the crawl finishes, so the pipe stays open while needed.
    with open("out_spider_distributed.txt", "w") as save_pipe:
        saver = spider.Saver(save_pipe=save_pipe)

        # define url_filter (black/white pattern lists come from module scope)
        url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns)

        # initial web_spider and point it at the shared Redis queues
        web_spider_dist = spider.WebSpiderDist(fetcher, parser, saver, url_filter=url_filter, monitor_sleep_time=5)
        web_spider_dist.init_redis(host="localhost", port=6379, key_wait="spider.wait", key_all="spider.all")

        # add start url
        web_spider_dist.set_start_url("http://zhushou.360.cn/", keys=("360web",))

        # start web_spider and wait for the distributed crawl to complete
        web_spider_dist.start_work_and_wait_done(fetcher_num=10)
    return
def test_spider_distributed():
    """
    Test the distributed spider: build a fetcher/parser/saver trio, connect
    the spider to the Redis priority queues, and block until the crawl is done.

    No return value; output is written to "out_spider_distributed.txt".
    """
    # initial fetcher / parser / saver — you can also subclass any of these three
    fetcher = spider.Fetcher(max_repeat=1, sleep_time=0)
    parser = spider.Parser(max_deep=-1)

    # NOTE(fix): the original passed a bare open(...) handle that was never
    # closed. The `with` block is safe here because start_work_and_wait_done
    # blocks until the crawl finishes, so the pipe stays open while needed.
    with open("out_spider_distributed.txt", "w") as save_pipe:
        saver = spider.Saver(save_pipe=save_pipe)

        # define url_filter (black/white pattern lists come from module scope)
        url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns)

        # initial web_spider; no proxy source, tasks come from the Redis queues
        web_spider_dist = spider.WebSpiderDist(fetcher, parser, saver, proxieser=None, url_filter=url_filter, monitor_sleep_time=5)
        web_spider_dist.init_redis(host="localhost", port=6379, key_high_priority="spider.high", key_low_priority="spider.low")

        # start web_spider and wait for the distributed crawl to complete
        web_spider_dist.start_work_and_wait_done(fetcher_num=10)
    return
def test_spider_distributed():
    """
    Test the distributed spider using the custom MyFetcher/MyParser classes:
    connect to the Redis priority queues, start working, and wait until the
    crawl is finished.

    No return value; output is written to "out_distributed.txt".
    """
    # initial fetcher / parser / saver (fetcher and parser are project subclasses)
    fetcher = MyFetcher(max_repeat=1, sleep_time=0)
    parser = MyParser(max_deep=-1)

    # NOTE(fix): the original passed a bare open(...) handle that was never
    # closed. The `with` block is safe here because wait_for_finished blocks
    # until the crawl ends, so the pipe stays open while the saver needs it.
    with open("out_distributed.txt", "w") as save_pipe:
        saver = spider.Saver(save_pipe=save_pipe)

        # define url_filter (black/white pattern lists come from module scope)
        url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns)

        # initial web_spider; no proxy source, tasks come from the Redis queues
        web_spider_dist = spider.WebSpiderDist(fetcher, parser, saver, proxieser=None, url_filter=url_filter, monitor_sleep_time=5)
        web_spider_dist.init_redis(host="localhost", port=6379, key_high_priority="spider.high", key_low_priority="spider.low")

        # start web_spider
        web_spider_dist.start_working(fetcher_num=10)

        # wait for finished (is_over=True — presumably marks the run as final; confirm against library docs)
        web_spider_dist.wait_for_finished(is_over=True)
    return
def test_spider_distributed():
    """
    Test the distributed spider: build a fetcher/parser/saver trio, connect
    the spider to Redis, seed a start URL, and block until the crawl is done.

    No return value; output is written to "out_spider_distributed.txt".
    """
    # define fetcher, parser and saver — you can also subclass any of these three
    fetcher = spider.Fetcher(max_repeat=3, sleep_time=0)
    parser = spider.Parser(max_deep=-1)

    # NOTE(fix): the original passed a bare open(...) handle that was never
    # closed. The `with` block is safe here because start_work_and_wait_done
    # blocks until the crawl finishes, so the pipe stays open while needed.
    with open("out_spider_distributed.txt", "w") as save_pipe:
        saver = spider.Saver(save_pipe=save_pipe)

        # initialize WebSpiderDist and point it at the shared Redis queues
        web_spider_dist = spider.WebSpiderDist(fetcher, parser, saver, monitor_sleep_time=5)
        web_spider_dist.init_redis(host="localhost", port=6379, key_wait="spider.wait", key_all="spider.all")

        # add the seed url
        web_spider_dist.set_start_url("http://zhushou.360.cn/", keys=("360web",))

        # start the crawl task and wait for it to finish
        web_spider_dist.start_work_and_wait_done(fetcher_num=1)
    return