def spider(url_list):
    """Crawl every task in *url_list* and persist the collected question ids.

    Each crawled page feeds the module-global ``QUESTION_ID_SET``; after the
    run it is frozen to a tuple and dumped as Python source to question_id.py.

    NOTE(review): an identical ``spider`` definition appears again later in
    this file; at import time the later definition wins.
    """
    # Alternative fetcher kept for reference:
    # fetcher = MultiHeadersFetch(headers=tuple({'Cookie': i} for i in COOKIE))
    fetcher = Fetch('/tmp', tuple({'Cookie': i} for i in COOKIE), 2.6, zhihu_topic_title)
    # WARNING: this local deliberately shadows the ``spider`` function name.
    spider = Rolling(fetcher, url_list)
    # Fixed: original had ``debug = False`` immediately overwritten (dead store).
    debug = True
    spider_runner = GSpider(spider, workers_count=1, debug=debug)
    spider_runner.start()
    global QUESTION_ID_SET
    # Freeze the accumulated ids so the pformat dump below is a stable literal.
    QUESTION_ID_SET = tuple(QUESTION_ID_SET)
    with open("question_id.py", "w") as question:
        question.write("QUESTION_ID_SET = ")
        question.write(pformat(QUESTION_ID_SET))
def spider(url_list):
    """Crawl every task in *url_list*, then dump ``QUESTION_ID_SET``
    (frozen to a tuple) as Python source into question_id.py.

    NOTE(review): this is a duplicate of the earlier ``spider`` definition;
    whichever appears last in the file shadows the other. Consider deleting one.
    """
    # Alternative fetcher kept for reference:
    # fetcher = MultiHeadersFetch( headers=tuple( { 'Cookie': i } for i in COOKIE))
    cookie_headers = tuple({'Cookie': i} for i in COOKIE)
    fetcher = Fetch('/tmp', cookie_headers, 2.6, zhihu_topic_title)
    # This local shadows the function name ``spider`` on purpose (original style).
    spider = Rolling(fetcher, url_list)
    # Fixed: original had ``debug = False`` immediately overwritten (dead store).
    debug = True
    spider_runner = GSpider(spider, workers_count=1, debug=debug)
    spider_runner.start()
    global QUESTION_ID_SET
    QUESTION_ID_SET = tuple(QUESTION_ID_SET)
    with open("question_id.py", "w") as question:
        question.write("QUESTION_ID_SET = ")
        question.write(pformat(QUESTION_ID_SET))
# Module-level leftover from an earlier run configuration; ``spider`` here is
# the function defined above (passed as the GSpider task).
# Fixed: original had ``debug = False`` immediately overwritten (dead store).
debug = True
spider_runner = GSpider(spider, workers_count=1, debug=debug)
spider_runner.start()

from operator import itemgetter

if __name__ == '__main__':
    # print u'["topic", "\u767e\u5ea6", "\u767e\u5ea6", "http://p1.zhimg.com//e7/5e/e75e39ed2_s.jpg", 413, "5854", "baidu"]'
    # Seed one crawl task per known topic; t[0] is the topic id used both in
    # the URL and as the task tag.
    url_list = []
    for t in ZHIHU_TOPIC:
        i = t[0]
        url_list.append((parse_topic, URL_TEMPLATE % i, i))
    spider(url_list)
    with open('zhihu_topic_related_data.py', 'w') as topic:
        topic.write('#coding:utf-8\n')
        topic.write('TOPIC_RELATED = ')
        topic.write(pformat(RESULT))
        # Fixed: pformat() output has no trailing newline, so without this the
        # generated module read ``...]TOPIC = [...`` on one line — invalid Python.
        topic.write('\n')
        topic.write('TOPIC = ')
        topic.write(pformat(TOPIC))
from operator import itemgetter if __name__ == '__main__': # print u'["topic", "\u767e\u5ea6", "\u767e\u5ea6", "http://p1.zhimg.com//e7/5e/e75e39ed2_s.jpg", 413, "5854", "baidu"]' for i in chariter(): print i raw_input() raise url_list = [] for i in set(chariter()): url_list.append((parse_topic, URL_TEMPLATE%quote(i))) spider(url_list) RESULT = RESULT.items() RESULT.sort(key=itemgetter(0)) result = [] for i in RESULT: result.append([i[0]]) result[-1].extend(i[1]) result[-1][-1] = list(result[-1][-1]) with open('zhihu_topic_data.py', 'w') as topic: topic.write('#coding:utf-8\n') topic.write('ZHIHU_TOPIC = ') topic.write(pformat(result))
spider_runner = GSpider(spider, debug=debug) spider_runner.start() from operator import itemgetter if __name__ == '__main__': # print u'["topic", "\u767e\u5ea6", "\u767e\u5ea6", "http://p1.zhimg.com//e7/5e/e75e39ed2_s.jpg", 413, "5854", "baidu"]' for i in chariter(): print i raw_input() raise url_list = [] for i in set(chariter()): url_list.append((parse_topic, URL_TEMPLATE % quote(i))) spider(url_list) RESULT = RESULT.items() RESULT.sort(key=itemgetter(0)) result = [] for i in RESULT: result.append([i[0]]) result[-1].extend(i[1]) result[-1][-1] = list(result[-1][-1]) with open('zhihu_topic_data.py', 'w') as topic: topic.write('#coding:utf-8\n') topic.write('ZHIHU_TOPIC = ') topic.write(pformat(result))