import re

from bs4 import BeautifulSoup

# MogoQueue (the MongoDB-backed queue), download (the worker thread class),
# request (the HTTP helper used in start()) and PREFIX_STR are assumed to be
# defined in, or imported from, other modules of this project.


def startDown(url, rule, num, start, decoding=None):
    if not decoding:
        decoding = 'utf8'
    # The package list is read from a local file rather than fetched over HTTP.
    with open(url, encoding=decoding) as f:
        body = f.read()
    debs = body.split('\n')
    # Compiled pattern kept for reference; the loop below filters lines by
    # PREFIX_STR instead of applying the regex.
    rule = re.compile(rule)
    # Reset the crawl queue before refilling it.
    crawl_queue = MogoQueue('cetc15-apt', 'crawl_queue')
    crawl_queue.clear()
    for l in debs:
        l = l.strip()
        if len(l) == 0 or not l.startswith(PREFIX_STR):
            continue
        print('deb:' + l[start:])
        # Push the line with its first `start` characters stripped.
        crawl_queue.push(l[start:], 'a')
    # Spawn the requested number of download workers.
    for i in range(num):
        d = download(crawl_queue)
        d.start()
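
# Usage sketch (hypothetical arguments; the real package-list path, regex,
# worker count and slice offset come from wherever this project calls
# startDown):
#
# startDown('Packages', r'\.deb$', num=4, start=0)
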
def start(url):
    response = request.get(url, 3)
    Soup = BeautifulSoup(response.text, 'lxml')
    title = Soup.find('div', class_='gm').find('h1', id='gj').get_text()
    # Drop any stale entries, then recreate the image queue.
    spider_queue = MogoQueue('meinv', 'img_queue')
    spider_queue.clear()
    print(u'cleared collection img_queue')
    spider_queue = MogoQueue('meinv', 'img_queue')
    print(u'created collection img_queue')
    # The second-to-last cell of the pager table holds the last page number.
    max_span = Soup.find('table', class_='ptt').find_all('td')[-2].get_text()
    page_url = url
    for i in range(1, int(max_span) + 1):
        html = request.get(page_url, 3)
        Soup = BeautifulSoup(html.text, 'lxml')
        all_a = Soup.find('div', id='gdt').find_all('a')
        for a in all_a:
            href = a['href']
            name = a.img['alt']
            spider_queue.push(href, title, name)
        # Build the URL of the next page; the first iteration used the bare url.
        page_url = url + '?p=' + str(i)
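
# Usage sketch (hypothetical gallery URL; the real entry point elsewhere in
# the project decides which URL to pass):
#
# if __name__ == '__main__':
#     start('https://example.com/g/12345/abcdef/')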