def spider(url_list):
    """Start a one-worker GSpider, with debug on, over ``url_list``.

    The URLs are wrapped in a ``Rolling`` that fetches through
    ``Fetch('/tmp')``.
    """
    rolling = Rolling(Fetch('/tmp'), url_list)
    # The original assigned debug = False and immediately debug = True,
    # so the runner has always been started with debug enabled.
    runner = GSpider(rolling, workers_count=1, debug=True)
    runner.start()
def main():
    """Start a three-worker GSpider over ``Dongxi().daily_dongxi()``.

    Fetching goes through ``NoCacheFetch(0, headers={})``, i.e. no extra
    request headers are supplied.
    """
    source = Dongxi()
    no_cache_fetcher = NoCacheFetch(0, headers={})
    rolling = Rolling(no_cache_fetcher, source.daily_dongxi())
    GSpider(rolling, workers_count=3).start()
def spider(url_list):
    """Start a one-worker GSpider, with debug on, over ``url_list``.

    A ``Fetch`` is built with ``'/tmp'``, one ``{'Cookie': ...}`` dict per
    entry of ``COOKIE``, the value ``3.33`` and ``zhihu_topic_title``.
    """
    # Alternative fetcher left in place by the original author:
    # fetcher = MultiHeadersFetch( headers=tuple( { 'Cookie': i } for i in COOKIE))
    cookie_headers = tuple({'Cookie': cookie} for cookie in COOKIE)
    page_fetcher = Fetch('/tmp', cookie_headers, 3.33, zhihu_topic_title)
    rolling = Rolling(page_fetcher, url_list)
    # debug was assigned False and then True in the original; only True
    # ever reached GSpider.
    runner = GSpider(rolling, workers_count=1, debug=True)
    runner.start()
def main():
    """Start a ten-worker GSpider over the items produced by ``pagelister()``.

    Requests go through ``NoCacheFetch(0, headers=...)`` with the fixed
    header set defined below.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.7) Gecko/20091221 Firefox/3.5.7',
        'Accept': ' text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'zh-cn,zh;q=0.5',
        'Accept-Charset': 'gb18030,utf-8;q=0.7,*;q=0.7',
        'Content-type': 'application/x-www-form-urlencoded',
    }
    no_cache_fetcher = NoCacheFetch(0, headers=request_headers)
    rolling = Rolling(no_cache_fetcher, pagelister())
    GSpider(rolling, workers_count=10).start()
def spider(url_list):
    """Start a two-worker GSpider, with debug on, over ``url_list``.

    Fetching goes through a ``MultiHeadersFetch`` that receives one
    ``{'Cookie': ...}`` dict per entry of ``COOKIE``.
    """
    cookie_headers = tuple({'Cookie': cookie} for cookie in COOKIE)
    multi_fetcher = MultiHeadersFetch(headers=cookie_headers)
    rolling = Rolling(multi_fetcher, url_list)
    # The original toggled debug False -> True back to back; the effective
    # value passed on has always been True.
    runner = GSpider(rolling, workers_count=2, debug=True)
    runner.start()
def main():
    """Start a one-worker GSpider over every entry of ``DOUBAN_SITE_LIST``.

    Each ``(url, user_id, zsite_id)`` entry becomes a pair of
    ``ParseEventIndex(user_id, zsite_id)`` and the matching
    ``http://site.douban.com/widget/events/<url>/`` address.
    """
    jobs = [
        (
            ParseEventIndex(user_id, zsite_id),
            'http://site.douban.com/widget/events/%s/' % url,
        )
        for url, user_id, zsite_id in DOUBAN_SITE_LIST
    ]
    cookie_header = {'Cookie': 'bid=i9gsK/lU40A'}
    no_cache_fetcher = NoCacheFetch(10, headers=cookie_header)
    runner = GSpider(Rolling(no_cache_fetcher, jobs), workers_count=1)
    runner.start()
def spider(url_list):
    """Start a one-worker GSpider, with debug on, over ``url_list``.

    Fetching goes through ``NoCacheFetch(0, {"Cookie": COOKIE})``; here
    ``COOKIE`` is passed as a single header value.
    """
    # Arguments of an earlier variant, kept from the original for reference:
    #   '/home/zuroc/tmp',
    #   tuple( { 'Cookie': i.replace('Cookie:','').strip() } for i in COOKIE),
    #   1,
    cookie_header = {"Cookie": COOKIE}
    no_cache_fetcher = NoCacheFetch(0, cookie_header)
    rolling = Rolling(no_cache_fetcher, url_list)
    # debug was set to False and then straight to True in the original.
    runner = GSpider(rolling, workers_count=1, debug=True)
    runner.start()
def spider(url_list):
    """Start a one-worker GSpider, with debug on, over ``url_list``.

    A ``Fetch`` is built with ``'/home/zuroc/tmp'``, one ``{'Cookie': ...}``
    dict per ``COOKIE`` entry (with any ``Cookie:`` text removed and the
    result stripped), the value ``25`` and ``zhihu_topic_title``.
    """
    cookie_headers = tuple(
        {'Cookie': raw.replace('Cookie:', '').strip()} for raw in COOKIE
    )
    page_fetcher = Fetch('/home/zuroc/tmp', cookie_headers, 25, zhihu_topic_title)
    rolling = Rolling(page_fetcher, url_list)
    # Only the second of the original's two debug assignments (True) ever
    # took effect.
    runner = GSpider(rolling, workers_count=1, debug=True)
    runner.start()
def main():
    """Start a one-worker GSpider over the ``DOUBAN_SITE_LIST`` entries.

    For every ``(url, user_id, zsite_id)`` triple a
    ``ParseEventIndex(user_id, zsite_id)`` is paired with the address
    ``http://site.douban.com/widget/events/<url>/``.
    """
    event_template = 'http://site.douban.com/widget/events/%s/'
    targets = [
        (ParseEventIndex(user_id, zsite_id), event_template % url)
        for url, user_id, zsite_id in DOUBAN_SITE_LIST
    ]
    no_cache_fetcher = NoCacheFetch(10, headers={'Cookie': 'bid=i9gsK/lU40A'})
    rolling = Rolling(no_cache_fetcher, targets)
    GSpider(rolling, workers_count=1).start()
def main():
    """Start a ten-worker GSpider over whatever ``pagelister()`` produces.

    All requests share the header set below and are issued through
    ``NoCacheFetch(0, headers=...)``.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.7) Gecko/20091221 Firefox/3.5.7',
        'Accept': ' text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'zh-cn,zh;q=0.5',
        'Accept-Charset': 'gb18030,utf-8;q=0.7,*;q=0.7',
        'Content-type': 'application/x-www-form-urlencoded',
    }
    rolling = Rolling(NoCacheFetch(0, headers=browser_headers), pagelister())
    runner = GSpider(rolling, workers_count=10)
    runner.start()
def spider(url_list):
    """Start a one-worker GSpider, with debug on, over ``url_list``.

    The fetcher is ``Fetch('/tmp', <cookie header dicts>, 3.33,
    zhihu_topic_title)``, where the header dicts are built one per
    ``COOKIE`` entry.
    """
    # Alternative fetcher left in place by the original author:
    # fetcher = MultiHeadersFetch( headers=tuple( { 'Cookie': i } for i in COOKIE))
    per_cookie_headers = tuple({'Cookie': value} for value in COOKIE)
    rolling = Rolling(
        Fetch('/tmp', per_cookie_headers, 3.33, zhihu_topic_title),
        url_list,
    )
    # The original wrote debug = False followed by debug = True; True wins.
    GSpider(rolling, workers_count=1, debug=True).start()
def main():
    """Start a 100-worker GSpider over ``Yeeyan().yeeyan_daily()``.

    Requests are issued through ``NoCacheFetch(0, headers=...)`` using the
    fixed header set defined below.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Language': 'en,en-US;q=0.8,zh-CN;q=0.6,zh;q=0.4',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.zhihu.com',
        # Fixed: this entry used to be 'Referer:http': '//www.zhihu.com/'
        # (split on the wrong colon), which sent a header literally named
        # "Referer:http" instead of a Referer header carrying the URL.
        'Referer': 'http://www.zhihu.com/',
        'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11',
    }
    # NOTE(review): Host and Referer name www.zhihu.com although the URL
    # source here is Yeeyan - confirm that this is intended.
    new_yeeyan = Yeeyan()
    fetcher = NoCacheFetch(0, headers=headers)
    rolling = Rolling(fetcher, new_yeeyan.yeeyan_daily())
    spider_runner = GSpider(rolling, workers_count=100)
    spider_runner.start()
def spider(url_list):
    """Start a ten-worker GSpider, with debug on, over ``url_list``.

    Fetching goes through ``NoCacheFetch(0, {'Cookie': COOKIE})``; the
    whole of ``COOKIE`` is used as one header value.
    """
    # Arguments of an earlier variant, kept from the original for reference:
    #   '/home/zuroc/tmp',
    #   tuple( { 'Cookie': i.replace('Cookie:','').strip() } for i in COOKIE),
    #   1,
    rolling = Rolling(NoCacheFetch(0, {'Cookie': COOKIE}), url_list)
    # debug = False was overwritten by debug = True on the very next
    # statement in the original.
    runner = GSpider(rolling, workers_count=10, debug=True)
    runner.start()
def spider(url_list):
    """Run a one-worker GSpider over ``url_list``, then dump QUESTION_ID_SET.

    After ``start()`` returns, the module-level ``QUESTION_ID_SET`` is
    rebound to a tuple of its items and written to ``question_id.py`` as a
    ``QUESTION_ID_SET = ...`` assignment formatted with ``pformat``.
    """
    global QUESTION_ID_SET

    # Alternative fetcher left in place by the original author:
    # fetcher = MultiHeadersFetch( headers=tuple( { 'Cookie': i } for i in COOKIE))
    cookie_headers = tuple({'Cookie': cookie} for cookie in COOKIE)
    page_fetcher = Fetch('/tmp', cookie_headers, 2.6, zhihu_topic_title)
    rolling = Rolling(page_fetcher, url_list)
    # debug was assigned False and then True; only True was ever used.
    runner = GSpider(rolling, workers_count=1, debug=True)
    runner.start()

    # Presumably the set is filled while the spider runs - confirm against
    # the parsing callbacks.
    QUESTION_ID_SET = tuple(QUESTION_ID_SET)
    with open("question_id.py", "w") as out:
        out.write("QUESTION_ID_SET = ")
        out.write(pformat(QUESTION_ID_SET))
def spider(url_list):
    """Run a one-worker GSpider over ``url_list`` and save QUESTION_ID_SET.

    Once the runner's ``start()`` call returns, the global
    ``QUESTION_ID_SET`` is converted to a tuple (rebinding the global) and
    written to ``question_id.py`` as ``QUESTION_ID_SET = <pformat output>``.
    """
    global QUESTION_ID_SET

    # Alternative fetcher left in place by the original author:
    # fetcher = MultiHeadersFetch( headers=tuple( { 'Cookie': i } for i in COOKIE))
    per_cookie_headers = tuple({'Cookie': value} for value in COOKIE)
    rolling = Rolling(
        Fetch('/tmp', per_cookie_headers, 2.6, zhihu_topic_title),
        url_list,
    )
    # The original set debug False and then True back to back.
    GSpider(rolling, workers_count=1, debug=True).start()

    QUESTION_ID_SET = tuple(QUESTION_ID_SET)
    with open("question_id.py", "w") as dump_file:
        dump_file.write("QUESTION_ID_SET = ")
        dump_file.write(pformat(QUESTION_ID_SET))