Пример #1
0
def main():
    """Seed a spider with the 'US' all-categories URL and run it.

    Only the ``SiteUrl`` entry whose key is ``'US'`` contributes a start
    URL; every other site is skipped.  Each seed is a ``(url, meta)``
    pair where ``meta`` records the site key under ``'site'``.
    """
    # Filter to the 'US' site while building the seed list.
    seeds = [
        ('http://%s/sch/allcategories/all-categories' % host, {'site': site})
        for site, host in SiteUrl.items()
        if site == 'US'
    ]

    # NOTE(review): the positional arguments look like a database path
    # followed by a host/port pair -- confirm against LevelDBPipeline.
    spider = Spider(
        LevelDBPipeline('ebay.ldb', '127.0.0.1', 11222, seeds),
        max_running=1000000)

    spider.run()
Пример #2
0
#! /usr/bin/env python
#coding=utf-8
from site_config import SiteUrl
import urllib2
import re

# Captures the run of digits in paths shaped like ".../<digits>/i.html".
id_re = re.compile(r'/(\d+)/i\.html', re.I | re.M)

# For every site in SiteUrl, download its all-categories page and write one
# "<site key>\t<matched id>" line per regex match to categorys.txt
# (the ids are presumably category ids -- verify against the page markup).
#
# The `with` block guarantees the output file is flushed and closed even if
# a download raises part-way through; previously the handle was never closed.
with open('categorys.txt', 'w') as f:
    for k, v in SiteUrl.items():
        url = 'http://%s/allcategories/all-categories' % v

        response = urllib2.urlopen(url)
        try:
            data = response.read()
        finally:
            # urllib2 responses are not context managers on Python 2, so
            # release the connection explicitly.
            response.close()

        id_list = id_re.findall(data)

        # Emit each record as a complete line.  For a non-empty match list
        # this is byte-identical to joining with '\n' and appending '\n';
        # for an empty list it writes nothing instead of a stray blank line
        # that would corrupt the tab-separated output.
        f.writelines('%s\t%s\n' % (k, x) for x in id_list)

        # Single-argument call form prints the same text as the old
        # `print k, 'finish'` statement and also parses on Python 3.
        print('%s finish' % (k,))