Exemplo n.º 1
0
def spider(url_list):
    """Run a single-worker, debug-enabled crawl over ``url_list``.

    ``url_list`` is handed unchanged to ``Rolling`` together with a ``Fetch``
    built from ``'/tmp'`` (presumably a cache directory -- confirm against
    ``Fetch``).
    """
    rolling = Rolling(Fetch('/tmp'), url_list)

    # Debug mode is currently forced on; set this to False to disable it.
    verbose = True

    runner = GSpider(rolling, workers_count=1, debug=verbose)
    runner.start()
Exemplo n.º 2
0
def main():
    """Crawl what ``Dongxi.daily_dongxi()`` returns, using three workers.

    The fetcher is a ``NoCacheFetch`` created with ``0`` as its first
    argument and an empty ``headers`` dict.
    """
    source = Dongxi()
    no_cache = NoCacheFetch(0, headers={})
    rolling = Rolling(no_cache, source.daily_dongxi())
    GSpider(rolling, workers_count=3).start()
Exemplo n.º 3
0
def spider(url_list):
    """Run a single-worker, debug-enabled crawl over ``url_list``.

    One ``{'Cookie': ...}`` header dict is built per entry of ``COOKIE`` and
    passed to ``Fetch`` along with ``'/tmp'``, ``3.33`` and
    ``zhihu_topic_title`` (the meaning of the last two arguments is defined
    by ``Fetch`` and is not visible here).
    """
    # An earlier variant used MultiHeadersFetch with the same header tuple.
    cookie_headers = tuple({'Cookie': cookie} for cookie in COOKIE)
    page_fetcher = Fetch('/tmp', cookie_headers, 3.33, zhihu_topic_title)
    rolling = Rolling(page_fetcher, url_list)

    # Debug mode is currently forced on; set this to False to disable it.
    verbose = True

    GSpider(rolling, workers_count=1, debug=verbose).start()
Exemplo n.º 4
0
def main():
    """Crawl what ``pagelister()`` returns with ten workers.

    Requests go through a ``NoCacheFetch`` created with ``0`` and a fixed set
    of browser-like request headers.
    """
    request_headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.7) '
            'Gecko/20091221 Firefox/3.5.7'
        ),
        # The leading space in this value is kept exactly as it was.
        'Accept': (
            ' text/xml,application/xml,application/xhtml+xml,'
            'text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
        ),
        'Accept-Language': 'zh-cn,zh;q=0.5',
        'Accept-Charset': 'gb18030,utf-8;q=0.7,*;q=0.7',
        'Content-type': 'application/x-www-form-urlencoded',
    }
    no_cache = NoCacheFetch(0, headers=request_headers)
    rolling = Rolling(no_cache, pagelister())
    runner = GSpider(rolling, workers_count=10)
    runner.start()
Exemplo n.º 5
0
def spider(url_list):
    """Run a two-worker, debug-enabled crawl over ``url_list``.

    A ``MultiHeadersFetch`` is given one ``{'Cookie': ...}`` header dict per
    entry of ``COOKIE``.
    """
    cookie_headers = tuple({'Cookie': cookie} for cookie in COOKIE)
    rolling = Rolling(MultiHeadersFetch(headers=cookie_headers), url_list)

    # Debug mode is currently forced on; set this to False to disable it.
    verbose = True

    runner = GSpider(rolling, workers_count=2, debug=verbose)
    runner.start()
def main():
    """Crawl the events widget page of every site in ``DOUBAN_SITE_LIST``.

    Each ``(url, user_id, zsite_id)`` entry becomes a pair of
    ``ParseEventIndex(user_id, zsite_id)`` and the widget URL built from
    ``url``. The pairs are crawled by one worker through a ``NoCacheFetch``
    created with ``10`` and a fixed ``Cookie`` header.
    """
    targets = [
        (
            ParseEventIndex(user_id, zsite_id),
            'http://site.douban.com/widget/events/%s/' % url,
        )
        for url, user_id, zsite_id in DOUBAN_SITE_LIST
    ]

    no_cache = NoCacheFetch(10, headers={'Cookie': 'bid=i9gsK/lU40A'})
    rolling = Rolling(no_cache, targets)
    GSpider(rolling, workers_count=1).start()
Exemplo n.º 7
0
def spider(url_list):
    """Run a single-worker, debug-enabled crawl over ``url_list``.

    The fetcher is a ``NoCacheFetch`` created with ``0`` and a single
    ``{"Cookie": COOKIE}`` dict as positional arguments.
    """
    cookie_header = {"Cookie": COOKIE}
    rolling = Rolling(NoCacheFetch(0, cookie_header), url_list)

    # Debug mode is currently forced on; set this to False to disable it.
    verbose = True

    runner = GSpider(rolling, workers_count=1, debug=verbose)
    runner.start()
Exemplo n.º 8
0
def spider(url_list):
    """Run a single-worker, debug-enabled crawl over ``url_list``.

    Every entry of ``COOKIE`` has the text ``'Cookie:'`` removed and is then
    stripped of surrounding whitespace before being wrapped in a
    ``{'Cookie': ...}`` header dict. ``Fetch`` receives ``'/home/zuroc/tmp'``,
    that tuple of dicts, ``25`` and ``zhihu_topic_title`` (the meaning of the
    last two arguments is defined by ``Fetch`` and is not visible here).
    """
    cookie_headers = tuple(
        {'Cookie': raw.replace('Cookie:', '').strip()} for raw in COOKIE
    )
    page_fetcher = Fetch(
        '/home/zuroc/tmp', cookie_headers, 25, zhihu_topic_title
    )
    rolling = Rolling(page_fetcher, url_list)

    # Debug mode is currently forced on; set this to False to disable it.
    verbose = True

    GSpider(rolling, workers_count=1, debug=verbose).start()
Exemplo n.º 9
0
def main():
    """Crawl the events widget page of every site in ``DOUBAN_SITE_LIST``.

    For each ``(url, user_id, zsite_id)`` entry a
    ``ParseEventIndex(user_id, zsite_id)`` is paired with the widget URL
    built from ``url``. The resulting list is crawled by one worker.
    """
    targets = []
    for url, user_id, zsite_id in DOUBAN_SITE_LIST:
        parser = ParseEventIndex(user_id, zsite_id)
        address = 'http://site.douban.com/widget/events/%s/' % url
        targets.append((parser, address))

    cookie_header = {'Cookie': 'bid=i9gsK/lU40A'}
    rolling = Rolling(NoCacheFetch(10, headers=cookie_header), targets)
    runner = GSpider(rolling, workers_count=1)
    runner.start()
Exemplo n.º 10
0
def main():
    """Crawl what ``pagelister()`` returns with ten workers.

    All requests share one fixed set of browser-like headers, passed to a
    ``NoCacheFetch`` created with ``0`` as its first argument.
    """
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.7) Gecko/20091221 Firefox/3.5.7'
    # The leading space in this value is kept exactly as it was.
    accept = ' text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
    browser_headers = {
        'User-Agent': user_agent,
        'Accept': accept,
        'Accept-Language': 'zh-cn,zh;q=0.5',
        'Accept-Charset': 'gb18030,utf-8;q=0.7,*;q=0.7',
        'Content-type': 'application/x-www-form-urlencoded',
    }
    rolling = Rolling(NoCacheFetch(0, headers=browser_headers), pagelister())
    GSpider(rolling, workers_count=10).start()
Exemplo n.º 11
0
def spider(url_list):
    """Run a single-worker, debug-enabled crawl over ``url_list``.

    ``Fetch`` is called with ``'/tmp'``, one ``{'Cookie': ...}`` dict per
    entry of ``COOKIE``, ``3.33`` and ``zhihu_topic_title`` (what ``Fetch``
    does with the last two arguments is not visible here).
    """
    # An earlier variant used MultiHeadersFetch with the same header tuple.
    per_cookie_headers = tuple({'Cookie': value} for value in COOKIE)
    rolling = Rolling(
        Fetch('/tmp', per_cookie_headers, 3.33, zhihu_topic_title),
        url_list,
    )

    # Debug mode is currently forced on; set this to False to disable it.
    verbose = True

    runner = GSpider(rolling, workers_count=1, debug=verbose)
    runner.start()
Exemplo n.º 12
0
def main():
    """Crawl what ``Yeeyan.yeeyan_daily()`` returns with 100 workers.

    Requests go through a ``NoCacheFetch`` created with ``0`` and a fixed set
    of browser-like request headers.

    NOTE(review): ``Host`` and ``Referer`` point at www.zhihu.com although
    the URLs come from ``Yeeyan`` -- confirm this is intended.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Language': 'en,en-US;q=0.8,zh-CN;q=0.6,zh;q=0.4',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.zhihu.com',
        # The header name is "Referer" and its value is the full URL. The
        # pair must be split at the first colon, not inside "http://".
        'Referer': 'http://www.zhihu.com/',
        'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11',
    }
    new_yeeyan = Yeeyan()
    fetcher = NoCacheFetch(0, headers=headers)
    spider = Rolling(fetcher, new_yeeyan.yeeyan_daily())
    spider_runner = GSpider(spider, workers_count=100)
    spider_runner.start()
Exemplo n.º 13
0
def spider(url_list):
    """Run a ten-worker, debug-enabled crawl over ``url_list``.

    The fetcher is a ``NoCacheFetch`` created with ``0`` and a single
    ``{'Cookie': COOKIE}`` dict as positional arguments.
    """
    no_cache = NoCacheFetch(0, {'Cookie': COOKIE})
    rolling = Rolling(no_cache, url_list)

    # Debug mode is currently forced on; set this to False to disable it.
    verbose = True

    GSpider(rolling, workers_count=10, debug=verbose).start()
Exemplo n.º 14
0
def spider(url_list):
    """Crawl ``url_list`` with one worker, then dump ``QUESTION_ID_SET``.

    After the runner's ``start()`` returns, the module-level
    ``QUESTION_ID_SET`` is rebound to a tuple of its contents and written to
    ``question_id.py`` as a ``QUESTION_ID_SET = ...`` assignment.
    """
    global QUESTION_ID_SET

    # An earlier variant used MultiHeadersFetch with the same header tuple.
    cookie_headers = tuple({'Cookie': cookie} for cookie in COOKIE)
    page_fetcher = Fetch('/tmp', cookie_headers, 2.6, zhihu_topic_title)
    rolling = Rolling(page_fetcher, url_list)

    # Debug mode is currently forced on; set this to False to disable it.
    verbose = True

    GSpider(rolling, workers_count=1, debug=verbose).start()

    # The global is rebound to a tuple, so it stays a tuple after this call.
    QUESTION_ID_SET = tuple(QUESTION_ID_SET)
    with open("question_id.py", "w") as out:
        out.write("QUESTION_ID_SET = ")
        out.write(pformat(QUESTION_ID_SET))
Exemplo n.º 15
0
def spider(url_list):
    """Crawl ``url_list`` with one worker, then dump ``QUESTION_ID_SET``.

    ``Fetch`` is called with ``'/tmp'``, one ``{'Cookie': ...}`` dict per
    entry of ``COOKIE``, ``2.6`` and ``zhihu_topic_title``. Once the runner's
    ``start()`` returns, the module-level ``QUESTION_ID_SET`` is rebound to a
    tuple and written to ``question_id.py`` as Python source.
    """
    global QUESTION_ID_SET

    # An earlier variant used MultiHeadersFetch with the same header tuple.
    per_cookie_headers = tuple({'Cookie': value} for value in COOKIE)
    rolling = Rolling(
        Fetch('/tmp', per_cookie_headers, 2.6, zhihu_topic_title),
        url_list,
    )

    # Debug mode is currently forced on; set this to False to disable it.
    verbose = True

    runner = GSpider(rolling, workers_count=1, debug=verbose)
    runner.start()

    # The global is rebound to a tuple, so it stays a tuple after this call.
    QUESTION_ID_SET = tuple(QUESTION_ID_SET)
    with open("question_id.py", "w") as dump_file:
        dump_file.write("QUESTION_ID_SET = ")
        dump_file.write(pformat(QUESTION_ID_SET))