Example #1
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Keep crawling while the URL manager has new URLs and fewer than 100 have been fetched
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract new URLs and data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs back into the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the extracted data
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")
        # Write the stored data out in the target format
        self.output.output_html()
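The UrlManager these examples rely on is never shown. A minimal sketch, assuming the usual two-set design (pending URLs vs. already-crawled URLs) behind the add_new_url, add_new_urls, has_new_url, get_new_url, and old_url_size calls used above, might look like this:

class UrlManager(object):
    """Hypothetical URL manager: tracks pending and already-crawled URLs."""

    def __init__(self):
        self.new_urls = set()   # URLs still waiting to be crawled
        self.old_urls = set()   # URLs already handed out

    def add_new_url(self, url):
        # Ignore empty URLs, duplicates, and URLs already crawled
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        # Move one URL from the pending set into the crawled set
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def old_url_size(self):
        return len(self.old_urls)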
Example #2
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self):
        # Add the entry URLs
        self.manager.add_new_url(
            "https://www.amazon.cn/gp/bestsellers/books/ref=zg_bs_pg_1?ie=UTF8&pg=1"
        )
        self.manager.add_new_url(
            "https://www.amazon.cn/gp/bestsellers/books/ref=zg_bs_pg_2?ie=UTF8&pg=2"
        )

        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract the book data with the HTML parser
                book_details = self.parser.parser(new_url, html)
                # Store the extracted data
                print(book_details)
                for book_detail in book_details:
                    self.output.store_book(book_detail)
            except Exception as e:
                print("crawl failed")
        self.output.output_html()
Example #3
 def crawl(self, root_url):
     self.manager.add_new_url(root_url)
     while (self.manager.has_new_url() and self.manager.get_old_url_size() < 100):
         try:
             new_url = self.manager.get_new_url()
             html = HtmlDownloader.download(new_url)
             data = self.parser.parser(new_url, html)
             # self.manager.add_new_url(new_urls)
             self.output.store_data(data)
             # print('Crawled %s links so far' % self.manager.get_old_url_size())
         except Exception as e:
             print('crawl exception %s' % e)
Example #4
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 300):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("已经抓取%s个链接" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")

        self.output.output_html()
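The DataOutput object is likewise only used, never defined, in these examples. A rough sketch of what store_data and output_html could look like, assuming records are buffered in memory and dumped as a simple HTML table at the end (the file name and table layout are assumptions):

class DataOutput(object):
    """Hypothetical data store: buffers records and writes them to an HTML file."""

    def __init__(self):
        self.datas = []

    def store_data(self, data):
        if data is not None:
            self.datas.append(data)

    def output_html(self, path='output.html'):
        # Write the buffered records out as a minimal HTML table
        with open(path, 'w', encoding='utf-8') as fout:
            fout.write('<html><body><table>\n')
            for data in self.datas:
                fout.write('<tr>')
                for value in data.values():
                    fout.write('<td>%s</td>' % value)
                fout.write('</tr>\n')
            fout.write('</table></body></html>\n')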
Example #5
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawlHTML(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)

        # Keep crawling while the URL manager still has new URLs
        while self.manager.has_new_url():
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract the page data with the HTML parser
                self.parser.parser(new_url, html)
            except Exception as e:
                print(e)
                print("crawl failed")
Example #6
 def __init__(self):
     self.manager = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.output = DataOutput()
Example #7
 def __init__(self):
     self.htmlDownloader = HtmlDownloader()
     self.dataOutput = DataOutput()
Example #8
import json
import re
from urllib.parse import unquote

from bs4 import BeautifulSoup


class HtmlParser(object):
    def __init__(self):
        self.htmlDownloader = HtmlDownloader()
        self.dataOutput = DataOutput()

    def parser(self, page_url, html_cont):
        '''
        Parse the page content and extract URLs and data.
        :param page_url: URL of the downloaded page
        :param html_cont: downloaded page content
        :return: the extracted URLs
        '''

        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        return new_urls

    def _get_new_urls(self, page_url, soup):
        '''
        Extract the set of new URLs.
        :param page_url: URL of the downloaded page
        :param soup: parsed BeautifulSoup object
        :return: the set of new URLs
        '''

        new_urls = set()
        # Select the product tiles that contain the <a> tags we need
        wrapper = soup.select(
            'div.exp-gridwall-standard > div.exp-product-wall > div')
        # print(len(wrapper))
        i = 0
        headers = ['title', 'category', 'imgName', 'price']
        for child in wrapper:
            link = child.div.div.div.div.a['href']
            imgLink = self.htmlDownloader.imgDownloader(link)
            imgFileName = self.getFileName(imgLink)
            name = child.div.select('.product-display-name')[0].string
            category = child.div.select('.product-subtitle')[0].string
            price = child.div.select(
                '.local.nsg-font-family--base')[0].string
            if name is None:
                name = ''
            if category is None:
                category = ''
            if imgFileName is None:
                imgFileName = ''
            if price is None:
                price = '0'
            data = {
                'title': name,
                'category': category,
                'imgName': imgFileName,
                'price': price
            }
            print('Current download progress: ' + str(i))
            i = i + 1
            self.dataOutput.datas.append(data)
            print(data)
        self.dataOutput.output_csv(headers)
        return new_urls

    def _get_new_data(self, page_url, soup):
        '''
        Extract the useful data.
        :param page_url: URL of the downloaded page
        :param soup: parsed BeautifulSoup object
        :return: the extracted data
        '''
        data = {}
        data['url'] = unquote(page_url)
        title = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        data['title'] = title.get_text()
        summary = soup.find('div', class_='lemma-summary')

        # get_text() returns all text contained in the tag, including its
        # descendants, joined as a single Unicode string
        data['summary'] = summary.get_text()

        return data

    def parserJSON(self, jsonData):
        if jsonData is None:
            return
        text = json.loads(jsonData)
        itemsArr = text['sections'][0]['items']
        # print(text['sections'][0]['items'])
        i = 0
        headers = ['title', 'category', 'imgName', 'price']
        for item in itemsArr:
            title = item['title']
            category = item['subtitle']
            price = item['localPrice']
            link = item['pdpUrl']
            imgLink = self.htmlDownloader.imgDownloader(link)
            imgFileName = self.getFileName(imgLink)
            if title is None:
                title = ''
            if category is None:
                category = ''
            if imgFileName is None:
                imgFileName = ''
            if price is None:
                price = '0'
            data = {
                'title': title,
                'category': category,
                'imgName': imgFileName,
                'price': price
            }
            print('Current download status: ' + str(i))
            print(data)
            i = i + 1
            self.dataOutput.datas.append(data)
        self.dataOutput.output_csv(headers)

    def getFileName(self, url, file_path='', file_suffix=''):
        # Use the last path segment of the URL as the file name
        queryArr = url.split('/')
        name = queryArr[-1]
        # Prefix names that start with a digit with 'a'
        if name[:1].isdigit():
            name = 'a' + name
        # Replace a trailing '.tif?...' query with a '.jpg' extension
        file_name = re.sub(r'\.tif\?(\D)*$', '.jpg', name)
        return '{}{}{}'.format(file_path, file_name, file_suffix)
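For illustration, getFileName keeps only the last path segment of an image URL, prefixes a leading digit with 'a', and swaps a trailing '.tif?...' query for '.jpg'. The URL below is a made-up example, not one from the original code:

parser = HtmlParser()
# Hypothetical image URL, used only to show the name transformation
print(parser.getFileName('https://example.com/images/123-shoe.tif?fmt=png'))
# -> a123-shoe.jpg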