class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Keep going while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.manager.has_new_url() and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract new URLs and data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the extracted data
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")
        # Have the data store write the results out in the target format
        self.output.output_html()
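The spider above only references UrlManager; a minimal sketch of the interface it relies on (add_new_url, add_new_urls, has_new_url, get_new_url, old_url_size), backed by two sets, could look like the following. This is an assumption for illustration, not the project's actual implementation.

# Sketch of the UrlManager interface assumed by the spider above (illustrative only).
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs still waiting to be crawled
        self.old_urls = set()   # URLs that have already been crawled

    def has_new_url(self):
        return len(self.new_urls) != 0

    def add_new_url(self, url):
        # Queue a single URL if it has not been seen before
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        # Queue a batch of URLs
        if urls is None:
            return
        for url in urls:
            self.add_new_url(url)

    def get_new_url(self):
        # Move one URL from the pending set to the crawled set and return it
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def old_url_size(self):
        return len(self.old_urls)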
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self):
        # Add the entry URLs
        self.manager.add_new_url(
            "https://www.amazon.cn/gp/bestsellers/books/ref=zg_bs_pg_1?ie=UTF8&pg=1"
        )
        self.manager.add_new_url(
            "https://www.amazon.cn/gp/bestsellers/books/ref=zg_bs_pg_2?ie=UTF8&pg=2"
        )
        while (self.manager.has_new_url() and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract the book data with the HTML parser
                book_details = self.parser.parser(new_url, html)
                # Store each record with the data store
                print(book_details)
                for book_detail in book_details:
                    self.output.store_book(book_detail)
            except Exception as e:
                print("crawl failed")
        self.output.output_html()
def crawl(self, root_url):
    self.manager.add_new_url(root_url)
    while (self.manager.has_new_url() and self.manager.get_old_url_size() < 100):
        try:
            new_url = self.manager.get_new_url()
            html = self.downloader.download(new_url)
            data = self.parser.parser(new_url, html)
            # self.manager.add_new_url(new_urls)
            self.output.store_data(data)
            # print('Crawled %s links so far' % self.manager.get_old_url_size())
        except Exception as e:
            print('crawl exception %s' % e)
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url() and self.manager.old_url_size() < 300):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")
        self.output.output_html()
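HtmlDownloader is likewise only referenced here; a minimal requests-based sketch of the download() method the spiders call might look like this (the User-Agent string, timeout, and encoding handling are assumptions, not the project's actual code):

# Sketch of the HtmlDownloader.download() interface used above, assuming the requests library.
import requests

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        headers = {'User-Agent': 'Mozilla/5.0'}  # assumed User-Agent
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            response.encoding = 'utf-8'
            return response.text
        return None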
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawlHTML(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Keep crawling while the URL manager still has new URLs
        while (self.manager.has_new_url()):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract the page data with the HTML parser
                self.parser.parser(new_url, html)
            except Exception as e:
                print(e)
                print("crawl failed")
def __init__(self):
    self.manager = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()
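DataOutput is also assumed rather than shown; a minimal sketch that buffers records in self.datas via store_data() and writes them out as an HTML table via output_html() could look like this (the output file name and table layout are illustrative assumptions):

# Sketch of the DataOutput interface used by the spiders above (file name and layout are assumptions).
import codecs

class DataOutput(object):
    def __init__(self):
        self.datas = []

    def store_data(self, data):
        # Buffer one extracted record in memory
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        # Write all buffered records into a simple HTML table
        with codecs.open('output.html', 'w', encoding='utf-8') as fout:
            fout.write('<html><body><table>')
            for data in self.datas:
                fout.write('<tr>')
                for value in data.values():
                    fout.write('<td>%s</td>' % value)
                fout.write('</tr>')
            fout.write('</table></body></html>')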
def __init__(self):
    self.htmlDownloader = HtmlDownloader()
    self.dataOutput = DataOutput()
import json
import re
from urllib.parse import unquote

from bs4 import BeautifulSoup


class HtmlParser(object):
    def __init__(self):
        self.htmlDownloader = HtmlDownloader()
        self.dataOutput = DataOutput()

    def parser(self, page_url, html_cont):
        '''
        Parse the page content and extract URLs and data.
        :param page_url: URL of the downloaded page
        :param html_cont: downloaded page content
        :return: the set of new URLs
        '''
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        return new_urls

    def _get_new_urls(self, page_url, soup):
        '''
        Extract the set of new URLs.
        :param page_url: URL of the downloaded page
        :param soup: soup
        :return: the set of new URLs
        '''
        new_urls = set()
        # Select the product tiles that contain the <a> tags we need
        wrapper = soup.select(
            'div.exp-gridwall-standard > div.exp-product-wall > div')
        # print(len(wrapper))
        i = 0
        headers = ['title', 'category', 'imgName', 'price']
        for child in wrapper:
            link = child.div.div.div.div.a['href']
            imgLink = self.htmlDownloader.imgDownloader(link)
            imgFileName = self.getFileName(imgLink)
            name = child.div.select('.product-display-name')[0].string
            category = child.div.select('.product-subtitle')[0].string
            price = child.div.select(
                '.local.nsg-font-family--base')[0].string
            if name is None:
                name = ''
            if category is None:
                category = ''
            if imgFileName is None:
                imgFileName = ''
            if price is None:
                price = '0'
            data = {
                'title': name,
                'category': category,
                'imgName': imgFileName,
                'price': price
            }
            print('Current download progress: ' + str(i))
            i = i + 1
            self.dataOutput.datas.append(data)
            print(data)
        self.dataOutput.output_csv(headers)
        return new_urls

    def _get_new_data(self, page_url, soup):
        '''
        Extract the useful data.
        :param page_url: URL of the downloaded page
        :param soup:
        :return: the extracted data
        '''
        data = {}
        data['url'] = unquote(page_url)
        title = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        data['title'] = title.get_text()
        summary = soup.find('div', class_='lemma-summary')
        # get_text() returns all text in the tag and its descendants as one string
        data['summary'] = summary.get_text()
        return data

    def parserJSON(self, jsonData):
        if jsonData is None:
            return
        text = json.loads(jsonData)
        itemsArr = text['sections'][0]['items']
        # print(text['sections'][0]['items'])
        i = 0
        headers = ['title', 'category', 'imgName', 'price']
        for item in itemsArr:
            title = item['title']
            category = item['subtitle']
            price = item['localPrice']
            link = item['pdpUrl']
            imgLink = self.htmlDownloader.imgDownloader(link)
            imgFileName = self.getFileName(imgLink)
            if title is None:
                title = ''
            if category is None:
                category = ''
            if imgFileName is None:
                imgFileName = ''
            if price is None:
                price = '0'
            data = {
                'title': title,
                'category': category,
                'imgName': imgFileName,
                'price': price
            }
            print('Current Download Status: ' + str(i))
            print(data)
            i = i + 1
            self.dataOutput.datas.append(data)
        self.dataOutput.output_csv(headers)

    def getFileName(self, url, file_path='', file_suffix=''):
        # Take the last segment of the URL path as the base file name
        queryArr = url.split('/')
        strLen = len(queryArr)
        name = queryArr[strLen - 1]
        try:
            # Names that start with a digit get an 'a' prefix
            if type(int(name[0])) == int:
                name = 'a' + name
        except Exception:
            name = name
        # Rewrite a trailing '.tif?...' query into a plain '.jpg' extension
        file_name = re.sub(r'.tif\?(\D)*$', '.jpg', name)
        return '{}{}{}'.format(file_path, file_name, file_suffix)
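For reference, getFileName() keeps only the last path segment of an image URL, prefixes names that start with a digit with 'a', and rewrites a trailing '.tif?...' query into '.jpg'. A hypothetical call (the URL is made up for illustration):

# Hypothetical usage of getFileName(); the URL below is illustrative only.
parser = HtmlParser()
print(parser.getFileName('https://example.com/images/12345_shoe.tif?fmt=png'))
# -> a12345_shoe.jpg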