Example #1
import time

class SpiderMan(object):
    # initialize the collaborating components
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownLoader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # add the entry URL
        self.manager.add_new_url(root_url)
        # loop while the URL manager has new URLs and fewer than 20 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 20:
            time.sleep(1)
            try:
                # fetch a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # the HTML downloader fetches the page
                html = self.downloader.download(new_url)
                # the HTML parser extracts new links and data from the page
                new_urls, data = self.parser.parser(new_url, html)
                # feed the extracted URLs back into the URL manager
                self.manager.add_new_urls(new_urls)
                # the data store keeps the extracted data
                self.output.store_data(data)
                print('crawled %s links so far' % self.manager.old_url_size())
            except Exception:  # Exception is the base class of common errors
                print('crawl failed')
        # the data store writes everything out in the target format
        self.output.output_html()
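All of the examples on this page rely on four collaborator classes (UrlManager, HtmlDownloader, HtmlParser, DataOutput) defined elsewhere in each project. The following is a minimal sketch of what they might look like so the crawl loops can actually run; the bodies (in-memory sets, a plain requests fetch, a stub parser) are illustrative assumptions, not any of the original implementations. Note that Example #1 spells the downloader HtmlDownLoader and Example #6 uses different manager method names, so adjust names accordingly.

import requests  # assumed HTTP library; any fetcher with the same shape works

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, url):
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def old_url_size(self):
        return len(self.old_urls)

class HtmlDownloader(object):
    def download(self, url):
        # fetch the page body; returns None on a non-200 response
        response = requests.get(url, timeout=10)
        return response.text if response.status_code == 200 else None

class HtmlParser(object):
    def parser(self, page_url, html):
        # stub: a real parser would extract links and fields from html here
        return set(), {'url': page_url}

class DataOutput(object):
    def __init__(self):
        self.datas = []

    def store_data(self, data):
        if data is not None:
            self.datas.append(data)

    def output_html(self):
        # stub: a real implementation would render self.datas to a file
        print('collected %d records' % len(self.datas))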
Example #2
import time

class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # add the entry URL
        self.manager.add_new_url(root_url)
        # loop while the URL manager has new URLs and fewer than 100 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # fetch a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # the HTML downloader fetches the page
                html = self.downloader.download(new_url)
                # the HTML parser extracts new links and data from the page
                new_urls, data = self.parser.parser(new_url, html)
                # feed the extracted URLs back into the URL manager
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed: %s" % e)
        # the data store writes everything out in the target format
        self.output.output_html()

    def store_proc(self, store_q):
        # storage loop intended to run in its own process, fed through a queue
        output = DataOutput()
        while True:
            if not store_q.empty():
                data = store_q.get()
                if data == 'end':
                    print('storage process received the end notification, finishing')
                    output.output_end(output.filepath())
                    return
                output.store_data(data)
            else:
                time.sleep(0.1)
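Example #2's store_proc is written to run as a separate storage process: it drains a queue and treats the string 'end' as a shutdown sentinel. A minimal sketch of how it might be wired up with the standard multiprocessing module follows; the producer side (whatever puts parsed data on store_q) is not shown in the original and is only hinted at here.

from multiprocessing import Process, Queue

if __name__ == '__main__':
    store_q = Queue()
    spider = SpiderMan()
    # run the storage loop in its own process
    storer = Process(target=spider.store_proc, args=(store_q,))
    storer.start()
    # ... a crawl process would put parsed data onto store_q here ...
    store_q.put('end')  # sentinel: tells store_proc to finish and return
    storer.join()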
Example #4
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.output = DataOutput()
        self.downloader = HtmlDownloader()  # instantiate the class, not a bare reference
        self.parser = HtmlParser()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.get_old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                # print('crawled %s links so far' % self.manager.get_old_url_size())
            except Exception as e:
                print('crawl exception %s' % e)
        self.output.output_html()
Example #5
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # seed the URL manager with the entry URL
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("crawled %s links so far" % self.manager.old_url_size())
            except Exception:
                print("crawl failed")
        self.output.output_html()
Example #6
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_url_to_undo_urls(root_url)
        while (self.manager.is_or_not_new_url()
               and self.manager.done_urls_size() < 100):
            try:
                new_url = self.manager.get_undo_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                # print(new_urls)
                self.manager.add_urls_to_undo_urls(new_urls)
                self.output.store_data(data)
                print("已抓取%s个链接" % self.manager.done_urls_size())
            except Exception as e:
                print("crawl failed!")
        self.output.output_html()
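Whatever the variant, the entry point is the same: construct a SpiderMan and hand crawl a seed URL. A minimal driver, with a placeholder URL, might look like this:

if __name__ == '__main__':
    spider = SpiderMan()
    spider.crawl('http://example.com/start')  # placeholder seed URL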