示例#1
0
class SpiderMan(object):
    """Crawl scheduler tying together a URL manager, downloader, parser and data store."""

    def __init__(self):
        # One collaborator of each kind is created up front and reused for every page.
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    # The initializer used to be misspelled ``_init_`` and therefore never ran on
    # construction. The old name stays callable for any code that invoked it by hand.
    _init_ = __init__

    def crawl(self, root_url, max_urls=100):
        """Crawl starting at ``root_url`` and write the collected data out at the end.

        :param root_url: entry URL handed to the URL manager.
        :param max_urls: stop once ``old_url_size()`` reaches this count
            (defaults to the previously hard-coded 100).
        """
        # Seed the URL manager with the entry point.
        self.manager.add_new_url(root_url)
        # Keep going while URLs are pending and the crawl budget is not used up.
        while self.manager.has_new_url() and self.manager.old_url_size() < max_urls:
            try:
                # Take the next pending URL from the URL manager.
                new_url = self.manager.get_new_url()
                # Download the page.
                html = self.downloader.download(new_url)
                # Extract outgoing links and page data.
                new_urls, data = self.parser.parser(new_url, html)
                # Queue the extracted links; without this only root_url is ever visited.
                self.manager.add_new_urls(new_urls)
                # Hand the page data to the data store.
                self.output.store_data(data)
                print("已经抓取%s个链接" % self.manager.old_url_size())
            except Exception as e:
                # Best-effort crawl: report the failure (with its cause) and move on.
                print("crawl failed: %r" % (e,))
        # Have the data store write everything out in its target format.
        self.output.output_html()
示例#2
0
class SpiderMan(object):
    """Crawl scheduler tying together a URL manager, downloader, parser and data store."""

    def __init__(self):
        """Create the collaborators used by :meth:`crawl`."""
        self.manager = UrlManager()
        self.downloader = HtmlDownLoader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url, max_urls=20, delay=1):
        """Crawl starting at ``root_url`` and write the collected data out at the end.

        :param root_url: entry URL handed to the URL manager.
        :param max_urls: stop once ``old_url_size()`` reaches this count
            (defaults to the previously hard-coded 20).
        :param delay: seconds to sleep before each page (defaults to the
            previously hard-coded 1).
        """
        # Seed the URL manager with the entry point.
        self.manager.add_new_url(root_url)

        # Keep going while URLs are pending and the crawl budget is not used up.
        while self.manager.has_new_url() and self.manager.old_url_size() < max_urls:
            time.sleep(delay)
            try:
                # Take the next pending URL from the URL manager.
                new_url = self.manager.get_new_url()
                # Download the page.
                html = self.downloader.download(new_url)
                # Extract outgoing links and page data.
                new_urls, data = self.parser.parser(new_url, html)
                # Queue the extracted links for later iterations.
                self.manager.add_new_urls(new_urls)
                # Hand the page data to the data store.
                self.output.store_data(data)
                print('已经抓取%s个链接' % self.manager.old_url_size())
            except Exception as e:
                # Best-effort crawl: report the failure (with its cause) and move on.
                print('crawl failed: %r' % (e,))
        # Have the data store write everything out in its target format.
        self.output.output_html()
示例#3
0
class SpiderMan(object):
    """Crawl scheduler for two fixed Amazon best-seller pages."""

    def __init__(self):
        """Create the URL manager, downloader, parser and data store."""
        self.manager, self.downloader = UrlManager(), HtmlDownloader()
        self.parser, self.output = HtmlParser(), DataOutput()

    def crawl(self):
        """Download both seed pages, store every parsed book, then write the output."""
        # Register the entry URLs with the URL manager.
        for seed_url in (
            "https://www.amazon.cn/gp/bestsellers/books/ref=zg_bs_pg_1?ie=UTF8&pg=1",
            "https://www.amazon.cn/gp/bestsellers/books/ref=zg_bs_pg_2?ie=UTF8&pg=2",
        ):
            self.manager.add_new_url(seed_url)

        while self.manager.has_new_url():
            # Stop once the manager reports 100 (or more) processed URLs.
            if not self.manager.old_url_size() < 100:
                break
            try:
                # Next pending URL -> downloaded HTML -> parsed book records.
                page_url = self.manager.get_new_url()
                page_html = self.downloader.download(page_url)
                books = self.parser.parser(page_url, page_html)
                print(books)
                # Hand each record to the data store.
                for book in books:
                    self.output.store_book(book)
            except Exception:
                print("crawl failed")
        self.output.output_html()
示例#4
0
class SpiderMan(object):
    """Crawl scheduler tying together a URL manager, downloader, parser and data store."""

    def __init__(self):
        """Create the collaborators used by :meth:`crawl`."""
        self.manager = UrlManager()
        self.output = DataOutput()
        # Store an instance, not the class itself, so crawl() can call download()
        # through it. NOTE(review): assumes HtmlDownloader() takes no arguments,
        # as in the sibling crawler classes - confirm.
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def crawl(self, root_url):
        """Crawl starting at ``root_url`` and write the collected data out at the end.

        Links found on a page are not queued here (the parser result is used as
        data only), so the loop processes whatever the URL manager already holds.

        :param root_url: entry URL handed to the URL manager.
        """
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.get_old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                # Go through the configured downloader rather than the class.
                html = self.downloader.download(new_url)
                data = self.parser.parser(new_url, html)
                self.output.store_data(data)
            except Exception as e:
                # Best-effort crawl: report the failure and move on.
                print('craml execption %s' % e)
        self.output.output_html()
class SpiderMan(object):
    """Crawl scheduler tying together a URL manager, downloader, parser and data store."""

    def __init__(self):
        """Create the collaborators used by :meth:`crawl`."""
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        """Crawl starting at ``root_url`` and write the collected data out at the end.

        :param root_url: entry URL handed to the URL manager.
        """
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("已经抓取%s个链接" % self.manager.old_url_size())
            except Exception as e:
                # Catch Exception, not everything: a bare ``except`` would also
                # swallow KeyboardInterrupt/SystemExit and make the crawl unstoppable.
                print("crawl failed: %r" % (e,))
        # NOTE(review): ``ouput_html`` looks like a typo for ``output_html``;
        # DataOutput is defined elsewhere, so the name is left as found - confirm.
        self.output.ouput_html()
示例#6
0
class SpiderMan(object):
    """Crawl scheduler tying together a URL manager, downloader, parser and data store."""

    def __init__(self):
        """Create the collaborators used by :meth:`crawl`."""
        self.manager, self.downloader = UrlManager(), HtmlDownloader()
        self.parser, self.output = HtmlParser(), DataOutput()

    def crawl(self, root_url):
        """Crawl starting at ``root_url``, then ask the data store to write its output.

        :param root_url: entry URL queued with the URL manager before the loop starts.
        """
        self.manager.add_url_to_undo_urls(root_url)
        while self.manager.is_or_not_new_url():
            # Stop once the manager reports 100 (or more) finished URLs.
            if not self.manager.done_urls_size() < 100:
                break
            try:
                page_url = self.manager.get_undo_url()
                page_html = self.downloader.download(page_url)
                found_urls, page_data = self.parser.parser(page_url, page_html)
                # Queue the links found on this page, then persist its data.
                self.manager.add_urls_to_undo_urls(found_urls)
                self.output.store_data(page_data)
                print("已抓取%s个链接" % self.manager.done_urls_size())
            except Exception:
                print("crawl failed!")
        self.output.output_html()
	def store_proc(self, store_q):
		"""Store every item received on ``store_q`` until the ``'end'`` sentinel arrives.

		:param store_q: queue of data items to store, terminated by the string ``'end'``.
			NOTE(review): assumes the standard queue API, where ``get()`` blocks
			until an item is available - confirm against the caller.
		"""
		output = DataOutput()
		while True:
			# Block until an item arrives instead of polling empty() and sleeping.
			data = store_q.get()
			if data == 'end':
				print('储存进度接受通知然后结束')
				# Let the data store finish off the file it has been writing.
				output.output_end(output.filepath())
				return
			output.store_data(data)
示例#8
0
 def __init__(self):
   """Create the URL manager, downloader, parser and data output helpers."""
   # Each tuple is evaluated left to right, so construction order is unchanged.
   self.manager, self.downloader = UrlManager(), HtmlDownLoader()
   self.parser, self.output = HtmlParser(), DataOutput()
示例#9
0
 def __init__(self):
     """Create the downloader and data output helpers."""
     # The tuple is evaluated left to right, so construction order is unchanged.
     self.htmlDownloader, self.dataOutput = HtmlDownloader(), DataOutput()
示例#10
0
class HtmlParser(object):
    """Scrape product records from an HTML page or a JSON payload.

    Each record (title, category, image file name, price) is appended to
    ``self.dataOutput.datas`` and the collected rows are then written through
    ``self.dataOutput.output_csv``. Text values are UTF-8 encoded before storing.
    """

    # Column order handed to output_csv by both scraping paths.
    _CSV_HEADERS = ('title', 'category', 'imgName', 'price')

    def __init__(self):
        self.htmlDownloader = HtmlDownloader()
        self.dataOutput = DataOutput()

    def parser(self, page_url, html_cont):
        """Parse a downloaded page and record the products found on it.

        :param page_url: URL of the downloaded page
        :param html_cont: content of the downloaded page
        :return: the set returned by ``_get_new_urls``, or ``None`` when either
            argument is ``None``
        """
        if page_url is None or html_cont is None:
            return None
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        return self._get_new_urls(page_url, soup)

    @staticmethod
    def _utf8_or(value, default):
        """Return ``value`` UTF-8 encoded, or ``default`` when ``value`` is ``None``."""
        # The check must come before encoding: None has no .encode().
        return default if value is None else value.encode('utf8')

    def _image_file_name(self, link):
        """Fetch the image for product page ``link`` and return its file name.

        NOTE(review): ``imgDownloader`` is defined elsewhere; it is presumed to
        return the image URL, with ``None`` treated as "no image" - confirm.
        """
        img_link = self.htmlDownloader.imgDownloader(link)
        if img_link is None:
            return ''
        return self.getFileName(img_link.encode('utf8'))

    def _get_new_urls(self, page_url, soup):
        """Record every product card found in ``soup`` and write the CSV.

        :param page_url: URL of the downloaded page (not used here)
        :param soup: parsed page
        :return: a set of new URLs; nothing is ever added to it in this method,
            so it is always empty
        """
        new_urls = set()
        # Select the product cards inside the grid wall.
        wrapper = soup.select(
            'div.exp-gridwall-standard > div.exp-product-wall > div')
        for index, child in enumerate(wrapper):
            link = child.div.div.div.div.a['href'].encode('utf8')
            img_file_name = self._image_file_name(link)
            # .string is None when a tag has no single text child; fall back
            # to the defaults instead of failing on None.encode().
            name = self._utf8_or(
                child.div.select('.product-display-name')[0].string, '')
            category = self._utf8_or(
                child.div.select('.product-subtitle')[0].string, '')
            price = self._utf8_or(
                child.div.select('.local.nsg-font-family--base')[0].string,
                '0')
            data = {
                'title': name,
                'category': category,
                'imgName': img_file_name,
                'price': price
            }
            print('目前下载进度: ' + str(index))
            self.dataOutput.datas.append(data)
            print(data)
        self.dataOutput.output_csv(list(self._CSV_HEADERS))
        return new_urls

    def _get_new_data(self, page_url, soup):
        """Extract the url, title and summary of a page.

        :param page_url: URL of the downloaded page
        :param soup: parsed page
        :return: dict with the keys ``url``, ``title`` and ``summary``
        """
        data = {}
        data['url'] = unquote(page_url)
        title = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        data['title'] = title.get_text()
        summary = soup.find('div', class_='lemma-summary')

        # get_text() gathers all text inside the tag, including text of
        # nested tags, into a single string.
        data['summary'] = summary.get_text()

        return data

    def parserJSON(self, jsonData):
        """Record every item of the first section of a JSON payload and write the CSV.

        :param jsonData: JSON text; when ``None`` nothing is done
        :return: ``None``
        """
        if jsonData is None:
            return
        text = json.loads(jsonData)
        items = text['sections'][0]['items']
        for index, item in enumerate(items):
            # JSON null decodes to None; fall back to the defaults instead of
            # failing on None.encode().
            title = self._utf8_or(item['title'], '')
            category = self._utf8_or(item['subtitle'], '')
            price = self._utf8_or(item['localPrice'], '0')
            link = item['pdpUrl'].encode('utf8')
            img_file_name = self._image_file_name(link)
            data = {
                'title': title,
                'category': category,
                'imgName': img_file_name,
                'price': price
            }
            print('Current Download Status: ' + str(index))
            print(data)
            self.dataOutput.datas.append(data)
        self.dataOutput.output_csv(list(self._CSV_HEADERS))

    def getFileName(self, url, file_path='', file_suffix=''):
        """Derive a file name from the last path segment of ``url``.

        A name starting with a digit gets an ``a`` prefix, and a trailing
        ``.tif?<non-digits>`` is replaced by ``.jpg``.

        :param url: URL whose last ``/``-separated segment becomes the name
        :param file_path: text placed before the name
        :param file_suffix: text placed after the name
        :return: ``file_path + name + file_suffix``
        """
        name = url.split('/')[-1]
        try:
            int(name[0])
        except (IndexError, ValueError):
            # Empty name or non-digit first character: keep the name as is.
            pass
        else:
            name = 'a' + name
        # The dot is escaped so only a literal ".tif" extension is rewritten.
        file_name = re.sub(r'\.tif\?(\D)*$', '.jpg', name)
        return '{}{}{}'.format(file_path, file_name, file_suffix)