Пример #1
0
def spider(url_list):
    """Run the crawler over ``url_list`` and save ``QUESTION_ID_SET`` to disk.

    A ``Fetch`` is built with one ``{'Cookie': ...}`` header dict per entry of
    ``COOKIE`` and handed, together with ``url_list``, to ``Rolling``; that
    object is then driven by a ``GSpider`` with a single worker and the debug
    flag set.  Once ``start()`` returns, the module-level ``QUESTION_ID_SET``
    is rebound to a tuple and written to ``question_id.py`` in the form
    ``QUESTION_ID_SET = <pformat output>``.

    NOTE(review): QUESTION_ID_SET is presumably filled in by the page
    callbacks while the crawl runs -- confirm against the parsers.
    """
    global QUESTION_ID_SET

    # Alternative fetcher kept for reference:
    #    MultiHeadersFetch(headers=tuple({'Cookie': i} for i in COOKIE))
    cookie_headers = tuple({'Cookie': cookie} for cookie in COOKIE)
    fetcher = Fetch('/tmp', cookie_headers, 2.6, zhihu_topic_title)
    rolling = Rolling(fetcher, url_list)

    # The debug flag handed to GSpider is currently hard-wired to True.
    debug = True
    GSpider(rolling, workers_count=1, debug=debug).start()

    # Freeze the ids into a tuple (this rebinds the global) before dumping.
    QUESTION_ID_SET = tuple(QUESTION_ID_SET)
    with open("question_id.py", "w") as question:
        question.write("QUESTION_ID_SET = ")
        question.write(pformat(QUESTION_ID_SET))
Пример #2
0
def spider(url_list):
    """Crawl ``url_list`` and persist the global ``QUESTION_ID_SET``.

    Steps, in order:

    1. Create a ``Fetch`` from ``'/tmp'``, a tuple of ``{'Cookie': ...}``
       dicts (one per ``COOKIE`` entry), ``2.6`` and ``zhihu_topic_title``.
    2. Wrap it with ``url_list`` in a ``Rolling`` and run that through a
       one-worker ``GSpider`` with ``debug=True``.
    3. Rebind the module-level ``QUESTION_ID_SET`` to a tuple of its items
       and write ``QUESTION_ID_SET = <pformat output>`` to
       ``question_id.py``.

    NOTE(review): the meaning of the ``'/tmp'`` and ``2.6`` arguments is
    defined by ``Fetch`` and is not visible here -- check its signature.
    """
    global QUESTION_ID_SET

    # A MultiHeadersFetch(headers=...) variant was tried here previously.
    headers = tuple({'Cookie': value} for value in COOKIE)
    crawler = Rolling(
        Fetch('/tmp', headers, 2.6, zhihu_topic_title),
        url_list,
    )

    runner = GSpider(crawler, workers_count=1, debug=True)
    runner.start()

    QUESTION_ID_SET = tuple(QUESTION_ID_SET)
    with open("question_id.py", "w") as out:
        out.write("QUESTION_ID_SET = ")
        out.write(pformat(QUESTION_ID_SET))
Пример #3
0
    debug = False
    debug = True

    spider_runner = GSpider(spider, workers_count=1, debug=debug)
    spider_runner.start()

from operator import itemgetter

if __name__ == '__main__':
    # Script entry point: crawl one URL per ZHIHU_TOPIC entry, then dump the
    # module-level RESULT and TOPIC objects into an importable Python file.

    #    print u'["topic", "\u767e\u5ea6", "\u767e\u5ea6", "http://p1.zhimg.com//e7/5e/e75e39ed2_s.jpg", 413, "5854", "baidu"]'

    # Each task is (callback, url, key); the key is the first field of the
    # topic entry and is also what gets substituted into URL_TEMPLATE.
    url_list = []
    for topic_entry in ZHIHU_TOPIC:
        topic_key = topic_entry[0]
        url_list.append((parse_topic, URL_TEMPLATE % topic_key, topic_key))

    spider(url_list)

    with open('zhihu_topic_related_data.py', 'w') as topic:
        topic.write('#coding:utf-8\n')
        topic.write('TOPIC_RELATED = ')
        topic.write(pformat(RESULT))
        # pformat() output does not end with a newline; without this break
        # the next assignment would be glued onto the previous literal and
        # the generated module would not be valid Python.
        topic.write('\n')
        topic.write('TOPIC = ')
        topic.write(pformat(TOPIC))
        topic.write('\n')



Пример #4
0
from operator import itemgetter

if __name__ == '__main__':
    # Script entry point (Python 2 syntax: `print` statement, raw_input()).

#    print u'["topic", "\u767e\u5ea6", "\u767e\u5ea6", "http://p1.zhimg.com//e7/5e/e75e39ed2_s.jpg", 413, "5854", "baidu"]'
    # Interactive inspection: show each value yielded by chariter() and wait
    # for Enter before moving on to the next one.
    for i in chariter():
        print i
        raw_input()

    # NOTE(review): a bare `raise` with no active exception raises an error
    # and stops the script here, so everything below is currently
    # unreachable.  This looks like a temporary debugging stop -- confirm
    # and remove either this statement or the inspection loop above.
    raise
    # One crawl task per distinct chariter() value: the parse_topic callback
    # paired with URL_TEMPLATE filled in with the quote()d value.
    url_list = []
    for i in set(chariter()):
        url_list.append((parse_topic, URL_TEMPLATE%quote(i)))
    spider(url_list)
    # Relies on items() returning a list (Python 2) so it can be sorted in
    # place; entries are ordered by their key (element 0 of each pair).
    RESULT = RESULT.items()
    RESULT.sort(key=itemgetter(0))
    # Flatten every (key, values) pair into one row: [key, value0, value1, ...].
    result = []
    for i in RESULT:
        result.append([i[0]])
        result[-1].extend(i[1])
        # Convert the row's last element to a list (presumably it is a set
        # or other iterable built by the parser -- verify).
        result[-1][-1] = list(result[-1][-1])

    # Emit the rows as a Python source file that assigns ZHIHU_TOPIC.
    with open('zhihu_topic_data.py', 'w') as topic:
        topic.write('#coding:utf-8\n')
        topic.write('ZHIHU_TOPIC = ')
        topic.write(pformat(result))



Пример #5
0
    spider_runner = GSpider(spider, debug=debug)
    spider_runner.start()


from operator import itemgetter

if __name__ == '__main__':
    # Script entry point (Python 2 syntax: `print` statement, raw_input()).

    #    print u'["topic", "\u767e\u5ea6", "\u767e\u5ea6", "http://p1.zhimg.com//e7/5e/e75e39ed2_s.jpg", 413, "5854", "baidu"]'
    # Step through the values yielded by chariter() one at a time, pausing
    # for Enter after each is printed.
    for i in chariter():
        print i
        raw_input()

    # NOTE(review): this bare `raise` has no active exception to re-raise, so
    # it errors out and the script never gets past this line.  The code below
    # is therefore dead at the moment -- likely a leftover debugging stop;
    # confirm which part is meant to stay.
    raise
    # Build the task list: (parse_topic, url) for each unique chariter()
    # value, with the value passed through quote() before substitution.
    url_list = []
    for i in set(chariter()):
        url_list.append((parse_topic, URL_TEMPLATE % quote(i)))
    spider(url_list)
    # items() must return a list here (Python 2) because it is sorted in
    # place, by key.
    RESULT = RESULT.items()
    RESULT.sort(key=itemgetter(0))
    # Turn each (key, values) pair into a flat row starting with the key.
    result = []
    for i in RESULT:
        result.append([i[0]])
        result[-1].extend(i[1])
        # The final element of the row is converted to a list (assumes it is
        # some other iterable such as a set -- TODO confirm).
        result[-1][-1] = list(result[-1][-1])

    # Write the rows out as a Python module defining ZHIHU_TOPIC.
    with open('zhihu_topic_data.py', 'w') as topic:
        topic.write('#coding:utf-8\n')
        topic.write('ZHIHU_TOPIC = ')
        topic.write(pformat(result))