Example #1
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

        pass
Example #2
 def __init__(self):
     self.manager = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.dataoutput = DataOutput()
     self.mongoengine = Use_MongoEngine()
     self.urloutput = Url_info_Output()
Example #3
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                print '1-------->new_url', new_url
                # Download the page
                html = self.downloader.download(new_url)
                print '2-------->html'
                # Parse the page and extract data
                new_urls, data = self.parser.parser(new_url, html)
                print '3-------->new_urls, data', new_urls, data
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                print '4-------->new_urls', new_urls
                # Store the extracted data
                self.output.store_data(data)
                print 'crawled %d links so far' % self.manager.old_url_size()
            except Exception, e:
                print 'crawl failed %s' % e
        # Write the stored data out in the target format
        self.output.output_html()
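
The four collaborators used in this and most of the following examples (UrlManager, HtmlDownloader, HtmlParser, DataOutput) are defined elsewhere in each project and are not shown on this page. The sketch below (Python 3) is only an assumption about the interface the crawl loops rely on, inferred from how the methods are called above; it is not any of the original implementations.

class UrlManager(object):
    # Assumed contract: track URLs still to crawl and URLs already crawled.
    def __init__(self):
        self.new_urls = set()   # not yet downloaded
        self.old_urls = set()   # already downloaded

    def has_new_url(self):
        return len(self.new_urls) != 0

    def add_new_url(self, url):
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def get_new_url(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def old_url_size(self):
        return len(self.old_urls)


class DataOutput(object):
    # Assumed contract: buffer parsed records, then write them out as HTML.
    def __init__(self):
        self.datas = []

    def store_data(self, data):
        if data is not None:
            self.datas.append(data)

    def output_html(self):
        # 'output.html' is a placeholder name; the real examples write files such as baike.html
        with open('output.html', 'w', encoding='utf-8') as fout:
            fout.write('<html><body>\n')
            for data in self.datas:
                fout.write('<p>%s</p>\n' % data)
            fout.write('</body></html>\n')

HtmlDownloader.download(url) is assumed to return the page source (or None on failure), and HtmlParser.parser(url, html) a (new_urls, data) pair, which is how every crawl loop above and below consumes them.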
Example #4
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and track how many have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 200):
            try:
                # Get a URL from the URL manager
                new_url = self.manager.get_new_url()
                # The HTML downloader fetches the page
                html = self.downloader.download(new_url)
                # The HTML parser extracts the page data
                new_urls, data = self.parser.parser(new_url, html)
                # Put the extracted URLs back into the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the extracted data
                self.output.store_data(data)
                print "crawled %s links so far" % self.manager.old_url_size()
            except Exception, e:
                print "crawl failed", e

        self.output.output_html()
Example #5
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()
    def crawl(self,root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and track how many have been crawled
        while(self.manager.has_new_url() and self.manager.old_url_size()<100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # The HTML downloader fetches the page
                html = self.downloader.download(new_url)
                # The HTML parser extracts the page data
                new_th, new_td, new_urls = self.parser.parser(new_url, html, "th", "时间", "td")
                # Add the extracted URLs to the URL manager
                if new_urls != "meiyou":
                    self.manager.add_new_urls(new_urls)
                    print "crawled %s links so far" % self.manager.old_url_size()
                # The data store writes the file
                if new_th != "meiyou" and new_td != "meiyou":
                    self.output.store_data(new_th, new_td)
                    self.output.output_html()
            except Exception as e:
                print "crawl failed!"
Example #6
class SpiderMan(object):

    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        while(self.manager.has_new_url() and
              self.manager.old_urls_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                # print(new_url, '.......')
                html = self.downloader.download(new_url)
                # print(html)
                new_urls, data = self.parser.parse(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('crawled %s links so far' % self.manager.old_urls_size())
            except Exception as e:
                print(e)
                # print('crawl failed')
        self.output.output_html()
Example #7
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and track how many have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # The HTML downloader fetches the page
                html = self.downloader.download(new_url)
                # The HTML parser extracts the page data
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                # self.manager.add_new_url(new_urls)    raises an "unhashable type: set" error, because an iterable collection cannot be used as a hash key
                for new_url in new_urls:
                    self.manager.add_new_url(new_url)
                # Store the extracted data
                self.output.store_data(data)
                print('crawled %s links so far' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed with' + str(e))
Example #8
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManger()
        self.downloader = HtmlDownload()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        # print(self.manager.new_url_size())
        # print(self.manager.old_urls_size())
        while (self.manager.has_new_url()
               and self.manager.old_urls_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                # print(html)
                # print('new url:', new_url)
                new_urls, data = self.parser.parser(new_url, html)
                # print("new_urls长度:", len(new_urls))
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('crawled %s links so far' % self.manager.old_urls_size())
            except Exception as e:
                print(e, 'Crawl failed')
        self.output.output_html()
        print("已保存至 baike.html")
Example #9
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawlOneTag(self, book_tag):
        page_num = 0
        book_list = []
        while page_num <= 2:
            try:
                new_url = self.manager.get_new_url(page_num, book_tag)
                html = self.downloader.download(new_url)
                book_list += self.parser.parser(html)
            except Exception as e:
                print("crawl failed")
            page_num += 1
        return book_list

    def crawlAllTags(self, book_tag_lists, topath):
        book_lists = []
        for book_tag in book_tag_lists:
            book_list = self.crawlOneTag(book_tag)
            book_list = sorted(book_list, key=lambda x: x[1], reverse=True)
            book_lists.append(book_list)
        self.output.output(book_lists, book_tag_lists, topath)
Example #10
class SpiderMan(object):
    '''Spider scheduler
    Attributes:
        manager: URL manager
        downloader: HTML downloader
        parser: HTML parser
        output: data store
    '''
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        '''Spider scheduling function
        Args:
            root_url: entry URL for the crawl
        Raises:
            Exception: 'NoneType' object has no attribute
        '''
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('crawled %s links so far' % self.manager.old_url_size())
            except Exception as e:
                print('Crawl failed: %s' % e)
        self.output.output_html()
Example #11
class SpiderMan(object):  
  def __init__(self):
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()

  def crawl(self,root_url):
    content = self.downloader.download(root_url)
    urls = self.parser.parser_url(root_url,content)
    for url in urls:
      try:
        time.sleep(0.1)
        t = time.strftime("%Y%m%d%H%M%S",time.localtime())
        rank_url ='http://service.library.mtime.com/Movie.api'\
        '?Ajax_CallBack=true'\
        '&Ajax_CallBackType=Mtime.Library.Services'\
        '&Ajax_CallBackMethod=GetMovieOverviewRating'\
        '&Ajax_CrossDomain=1'\
        '&Ajax_RequestUrl=%s'\
        '&t=%s'\
        '&Ajax_CallBackArgument0=%s'% (url[0],t,url[1])
        rank_content = self.downloader.download(rank_url)
        data = self.parser.parser_json(rank_url,rank_content)
        self.output.store_data(data)
      except Exception,e:
        print 'Crawl failed'
    self.output.output_end()
    print "Crawl finish"
Example #12
class SpiderRun(object):
    '''
    Main spider-scheduler program
    '''
    def __init__(self):
        self.manager = URLManage()
        self.parser = HTMLParse()
        self.downloader = HTMLDownloader()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 5:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print "抓取URL:", new_url
                print "已经抓取{}个链接".format(self.manager.old_url_size())
            except Exception, e:
                print "Exception is:", e
        self.output.output_html()
        print self.manager.new_url_size()
        print self.output.datas_size()
Example #13
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and track how many have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # The HTML downloader fetches the page
                html = self.downloader.download(new_url)
                # The HTML parser extracts the URLs and data
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data to a file
                self.output.store_data(data)
                print("crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")
        # The data store writes the output in the target format
        self.output.output_html()
Example #14
class SpiderMain(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = FileDownLoader()
        self.parser = FileParser()
        self.output = DataOutput()

    def crawl(self, root_files):

        for root_file in root_files:
            new_urls = self.parser.parser(root_file)
            self.manager.add_new_urls(new_urls)

            while (self.manager.has_new_url()):
                try:
                    new_url = self.manager.get_new_url()
                    data = self.downloader.download(new_url)
                    self.output.store_data(data, root_file, new_url)
                    print("已经抓取%s个链接" % self.manager.old_url_size())

                    interval = random.randint(1, 3)

                    time.sleep(interval)
                    print("sleep: %d" % interval)

                except Exception as err:
                    self.output.mark_result(root_file, new_url, False)
                    print("crawl faild:" + str(err))
Example #15
class SpiderMan(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParse()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.downloader(root_url)
        urls = self.parser.parser_url(root_url, content)
        # Build a URL that fetches the box-office/rating data
        for url in urls:
            print '---------->URL', url, url[0], url[1]
            try:
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                    '?Ajax_CallBack=true' \
                    '&Ajax_CallBackType=Mtime.Library.Services' \
                    '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                    '&Ajax_CrossDomain=1' \
                    '&Ajax_RequestUrl=%s' \
                    '&t=%s' \
                    '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1])
                rank_content = self.downloader.downloader(rank_url)
                print 'ajax response content, Chinese characters display correctly -------->', rank_content
                print 'ajax response content type is unicode -------->', type(rank_content)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception, e:
                print 'failed to fetch ajax dynamic data', e
        self.output.output_end()
        print '=======end========='
Example #16
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def start(self, url, numMax=50):
        self.manager.addUrl(url)
        num = 0
        errorsNum = 0
        while self.manager.sizeofNew() != 0 and num < numMax:
            try:
                num = num + 1
                url = self.manager.getUrl()
                print('%d\n %s' % (num, url))
                html = self.downloader.download(url)
                newUrls, data = self.parser.parser(url, html)
                self.output.addData(data)
                if self.manager.sizeofNew() + self.manager.sizeofOld(
                ) < numMax:
                    self.manager.addUrls(newUrls)
                print(data['title'])
            except:
                num = num - 1
                errorsNum = errorsNum + 1
                print('crawl failed %d' % errorsNum)
        self.output.outputData()
Example #17
class Schedul(object):
    def __init__(self):
        self._manager = UrlManager()
        self._download = HtmlDownload()
        self._parse = HtmlParse()
        self._output = DataOutput()

    def crawl(self, root_url):
        # Entry URL
        self._manager.add_new_url(root_url)
        # Loop while there are still new URLs
        while (self._manager.has_new_url()):
            try:
                # Get a URL from the URL manager
                new_url = self._manager.get_new_url()
                # Download the page
                response = self._download.download(new_url)
                # Parse the page and extract data
                new_url, data = self._parse.parse(new_url, response)
                # Add the newly extracted URL to the URL manager
                self._manager.add_new_url(new_url)
                # The data store saves the data
                self._output.store_data(data)
                print('crawled %s links so far' % self._manager.old_url_size())
                print(new_url)
            except Exception as e:
                print('crawl Error', e)
        # The data store writes the output in the target format
        try:
            self._output.output_db()
        except:
            print('output Error')
Example #18
class SpiderMan(object):

    def __init__(self):
        self.manger = UrlManger()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        print 'crawl %s ' % root_url
        self.manger.add_new_url(root_url)

        #pdb.set_trace()
        while (self.manger.has_new_url() and self.manger.old_urls_size() < 100):
            try:
                new_url = self.manger.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manger.add_new_urls(new_urls)
                self.output.store_data(data)
                print 'Has crawl %s links ' % self.manger.old_urls_size()
            except Exception, e:
                print "crawl failed %s" % e
                break

        self.output.output_html()
Example #19
class Spiderman(object):
    def __init__(self):
        self.manager = Urlmanager()
        self.downloader = HtmlDownloader()
        self.parser = Htmlparser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and track how many have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # The HTML downloader fetches the page
                html = self.downloader.download(new_url)
                # The HTML parser extracts the data
                new_urls, data = self.parser.parse(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # The data store writes the file
                self.output.store_data(data)
                print('crawled %s links so far' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed')
                raise
            # The data store writes the output in the target format
        self.output.output_html()
Example #20
class SpiderMan:
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and track how many have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # The HTML downloader fetches the page
                html = self.downloader.download(new_url)
                # print(html)
                # The HTML parser extracts the page data
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # The data store writes the file
                self.output.store_data(data)
                print("crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")
        self.output.output_html()
Example #21
class Spider_Scheduler(object):
    def __init__(self):
        self.urlmanager = UrlQueue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Seed the URL manager with the entry URL
        self.urlmanager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and track how many have been crawled
        while (self.urlmanager.has_new_url()
               and self.urlmanager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.urlmanager.get_new_url()
                # The HTML downloader fetches the page
                html = self.downloader.download(new_url)
                # The HTML parser extracts the page data
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.urlmanager.add_new_urls(new_urls)
                # The data store serializes the data
                self.output.data_to_list(data)
                print("crawled %s links so far" % self.urlmanager.old_url_size())
            except Exception as e:
                print("crawl failed")
        # The data store writes the output in the target format
        self.output.output_html()
Example #22
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)

        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 50):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                if html is None:
                    print('failed to get page')
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('has scraped %s links' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed')
        self.output.output_html()
Example #23
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()  # connects to the database when instantiated

    def crawl(self):
        self.output.create_table()      # create the table
        self.manager.add_new_urls()     # build the URLs
        total = self.manager.new_urls_size()
        bar = pyprind.ProgBar(30, title="Crawling......")   # progress bar
        while (self.manager.new_urls_size()):
            url = self.manager.get_new_url()
            html = self.downloader.download(url)
            data = self.parser.parse(html)
            errors, errors_messages = self.output.insert_into_db(
                data)        # insert into the database
            bar.update()
            '''
            sys.stdout.write(
                str(self.manager.old_urls_size() / total * 100) + "%")
            sys.stdout.flush()
            # print('crawled', self.manager.old_urls_size(), 'records.')
            '''
        self.output.close_cursor()  # close the database connection
        print("crawled", total, "records in this run")
        if errors:
            print("of which", errors, "records had errors")
            print("errors: " + str(errors_messages))
Example #24
class SpiderMain(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and track how many have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # The HTML downloader fetches the page
                html = self.downloader.download(new_url)
                # The HTML parser extracts the page data
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # The data store writes the file
                self.output.store_data(data)
                print("crawled %s links so far" % self.manager.old_url_size())
                #print(new_url)
            except Exception as e:
                print("crawl failed")
            # The data store writes the output in the target format
        self.output.output_html()
Example #25
class SpiderMan(object):
    """爬虫调度器"""
    def __init__(self):
        self.urlManager = UrlManager()
        self.htmlDownloader = HtmlDownloader()
        self.htmlParser = HtmlParser()
        self.htmlOutput = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.urlManager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and track how many have been crawled
        while (self.urlManager.has_new_url()
               and self.urlManager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.urlManager.get_new_url()
                # The HTML downloader fetches the page
                html = self.htmlDownloader.download(new_url)
                # The HTML parser extracts the page data
                new_urls, data = self.htmlParser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.urlManager.add_new_urls(new_urls)
                # The data store saves the data
                self.htmlOutput.store_data(data)
            except Exception as e:
                print(traceback.format_exc())
        # The data store writes the output in the target format
        self.htmlOutput.output_html()
Example #26
    def __init__(self, sort, sort_url, sortFilename):
        threading.Thread.__init__(self)
        self.sort = sort
        self.sort_url = sort_url
        self.sortFilename = sortFilename

        self.manager = UrlManager(self.sort)
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()
Example #27
 def add_new_urls(self):
     docids = self.get_DocID()
     db = DataOutput()
     old_docids = db.get_old_docids()
     db.close_cursor()
     for docid in docids:
         if docid not in old_docids:  # de-duplicate
             self.new_urls.add(
                 "http://wenshu.court.gov.cn/CreateContentJS/CreateContentJS.aspx?DocID="
                 + docid)
     print("url构造完成,准备开始爬取……")
Example #28
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()  # connects to the database when instantiated
        self.s = Settings().setting

    def crawl(self):
        self.output.create_table()  # create the table
        total_page = self.s["Index"][1] - self.s["Index"][0]
        total_data = total_page * self.s["Page"]
        total_errors = 0
        total_duplicates = 0
        old_total = self.output.get_total()

        for Index in range(self.s["Index"][0], self.s["Index"][1]):
            duplicates = self.manager.add_urls(Index, self.output)
            urls = self.manager.get_urls()
            bar = pyprind.ProgBar(self.s["Page"] - duplicates,
                                  title="Crawling " + "Page " + str(Index) +
                                  " ......")  # 进度条
            for url in urls:
                try:
                    bar.update()
                    html = self.downloader.download(url)
                    data = self.parser.parse(html)
                    self.output.insert_into_db(data)  # insert into the database
                except Exception:
                    continue
        new_total = self.output.get_total()
        self.output.close_cursor()  # close the database connection

        print("本次爬取", new_total - old_total, "条")
Example #29
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.dataoutput = DataOutput()
        self.mongoengine = Use_MongoEngine()
        self.urloutput = Url_info_Output()

    def crawl(self, initial_url):
        # Add the entry URL
        self.urloutput.output_url_info(initial_url)
        self.dataoutput.output_html()
        self.mongoengine.count()
        print("crawled %s patents in total" % self.mongoengine.count())
Example #30
 def store_proc(self, store_q):
     '''
     The data-storage process reads records from store_q and hands them to the data store
     :param store_q:
     :return:
     '''
     output = DataOutput()
     while True:
         if not store_q.empty():
             data = store_q.get()
             if data == 'end':
                 print('storage process received the stop notice, exiting!')
                 return
             output.store_data(data)
         else:
             time.sleep(0.2)
     pass
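
This store_proc fragment is the storage side of a multi-process spider: it keeps draining store_q and stops when it sees the 'end' sentinel. Below is a self-contained sketch of how such a process might be wired up with multiprocessing; the standalone store_proc here is a simplified stand-in that prints instead of calling DataOutput().store_data, and the sample record is made up for illustration.

import time
from multiprocessing import Process, Queue

def store_proc(store_q):
    # Stand-in for the method above: drain the queue until the 'end' sentinel arrives.
    while True:
        if not store_q.empty():
            data = store_q.get()
            if data == 'end':
                print('storage process received the stop notice, exiting')
                return
            print('storing:', data)   # the original calls DataOutput().store_data(data) here
        else:
            time.sleep(0.2)

if __name__ == '__main__':
    store_q = Queue()
    storer = Process(target=store_proc, args=(store_q,))
    storer.start()
    store_q.put({'url': 'http://example.com', 'title': 'demo'})  # hypothetical parsed record
    store_q.put('end')   # sentinel telling the storage process to return
    storer.join()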
Example #31
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while(self.manager.has_new_url() and self.manager.old_url_size()<100):
            try:
                new_url = self.manager.get_new_url()
                html=self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url,html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print "already get %s url" % self.manager.old_url_size()
            except Exception,e:
                print "crawl failed"
        self.output.output_html()
Example #32
 def __init__(self):
     self.manager = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.output = DataOutput()