Example #1
import re

# MogoQueue, download, and PREFIX_STR are defined elsewhere in the project.
def startDown(url, rule, num, start, decoding=None):
    if not decoding:
        decoding = 'utf8'

    # `url` is actually a local file path here; read and decode it.
    with open(url, encoding=decoding) as f:
        body = f.read()
    debs = body.split('\n')

    rule = re.compile(rule)  # compiled but unused in this local-file path

    crawl_queue = MogoQueue('cetc15-apt', 'crawl_queue')
    crawl_queue.clear()  # start from an empty queue
    for l in debs:
        l = l.strip()
        if not l or not l.startswith(PREFIX_STR):
            continue
        print('deb:' + l[start:])
        crawl_queue.push(l[start:], 'a')

    # Spawn the worker threads that drain the queue.
    for i in range(num):
        d = download(crawl_queue)
        d.start()
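
Both examples rely on a MogoQueue class, and Example #1 also uses a download worker, neither of which appears in these excerpts. The following is a minimal sketch, assuming MogoQueue is a MongoDB-backed task queue built on pymongo and download is a threading.Thread subclass; the status values, field names, and pop() method are guesses inferred from how clear(), push(), and start() are called above.

import threading
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

OUTSTANDING, PROCESSING = 0, 1  # assumed task status values

class MogoQueue:
    """Hypothetical MongoDB-backed task queue matching the calls above."""
    def __init__(self, db_name, collection_name, host='localhost', port=27017):
        self.collection = MongoClient(host, port)[db_name][collection_name]

    def clear(self):
        # Drop all queued documents so a crawl starts from a clean state.
        self.collection.delete_many({})

    def push(self, url, *extra):
        # Queue a task once; using the URL as _id deduplicates repeats.
        try:
            self.collection.insert_one(
                {'_id': url, 'status': OUTSTANDING, 'extra': list(extra)})
        except DuplicateKeyError:
            pass  # already queued

    def pop(self):
        # Atomically claim one outstanding task for a worker thread.
        record = self.collection.find_one_and_update(
            {'status': OUTSTANDING}, {'$set': {'status': PROCESSING}})
        if record is None:
            raise KeyError('queue is empty')
        return record['_id']

class download(threading.Thread):
    """Hypothetical worker thread that drains the queue until it is empty."""
    def __init__(self, queue):
        super().__init__()
        self.queue = queue

    def run(self):
        while True:
            try:
                url = self.queue.pop()
            except KeyError:
                break  # queue drained
            # fetch `url` and save the .deb file (project-specific, omitted)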
Example #2
from bs4 import BeautifulSoup

# `request` (a retrying GET helper) and MogoQueue come from the project.
def start(url):
    response = request.get(url, 3)
    soup = BeautifulSoup(response.text, 'lxml')
    title = soup.find('div', class_='gm').find('h1', id='gj').get_text()

    spider_queue = MogoQueue('meinv', 'img_queue')
    spider_queue.clear()  # reset the img_queue collection before crawling
    print('cleared collection img_queue')

    # The second-to-last pager cell holds the highest page number.
    max_span = soup.find('table', class_='ptt').find_all('td')[-2].get_text()
    page_url = url
    for i in range(1, int(max_span) + 1):
        html = request.get(page_url, 3)
        soup = BeautifulSoup(html.text, 'lxml')
        all_a = soup.find('div', id='gdt').find_all('a')
        for a in all_a:
            href = a['href']
            name = a.img['alt']
            spider_queue.push(href, title, name)
        page_url = url + '?p=' + str(i)  # queue up the next gallery page
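
Example #2 calls request.get(url, 3), which is not the requests library itself but a project-local wrapper. A minimal sketch, assuming the second positional argument is a retry budget around requests.get; the header dict and timeout are illustrative placeholders, not values from the original project:

import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # hypothetical default header

def get(url, retries=3, timeout=10):
    # Retry transient network errors, giving up after `retries` attempts.
    try:
        return requests.get(url, headers=HEADERS, timeout=timeout)
    except requests.RequestException:
        if retries > 0:
            return get(url, retries - 1, timeout)
        raise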