def __init__(self):
    self.manager = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()
def __init__(self):
    self.manager = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.dataoutput = DataOutput()
    self.mongoengine = Use_MongoEngine()
    self.urloutput = Url_info_Output()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Keep going while there are new URLs and fewer than 100 have been crawled
        while (self.manager.has_new_url() and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                print '1-------->new_url', new_url
                # Download the page
                html = self.downloader.download(new_url)
                print '2-------->html'
                # Parse the page and extract data
                new_urls, data = self.parser.parser(new_url, html)
                print '3-------->new_urls, data', new_urls, data
                # Add the extracted URLs to the manager
                self.manager.add_new_urls(new_urls)
                print '4-------->new_urls', new_urls
                # Store the extracted data
                self.output.store_data(data)
                print 'Crawled %d links so far' % self.manager.old_url_size()
            except Exception, e:
                print 'crawl failed %s' % e
        # Write the stored data out in the target format
        self.output.output_html()
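# A minimal sketch of how one of these schedulers is typically driven as a script.
# The seed URL below is a placeholder assumption (any article page of the target
# site would do); it is not taken from the source.
if __name__ == '__main__':
    spider_man = SpiderMan()
    spider_man.crawl('http://baike.baidu.com/view/284853.htm')  # placeholder seed URL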
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 200 have been crawled
        while (self.manager.has_new_url() and self.manager.old_url_size() < 200):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the extracted data
                self.output.store_data(data)
                print "Extracted %s links so far" % self.manager.old_url_size()
            except Exception, e:
                print "crawl failed", e
        self.output.output_html()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.manager.has_new_url() and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract th/td cells (keyed on "时间") and links with the HTML parser;
                # the parser returns the string "meiyou" when nothing was found
                new_th, new_td, new_urls = self.parser.parser(new_url, html, "th", "时间", "td")
                # Add the extracted URLs to the URL manager
                if new_urls != "meiyou":
                    self.manager.add_new_urls(new_urls)
                    print "Crawled %s links so far" % self.manager.old_url_size()
                # Store the extracted data
                if new_th != "meiyou" and new_td != "meiyou":
                    self.output.store_data(new_th, new_td)
                    self.output.output_html()
            except Exception as e:
                print "Crawl failed!", e
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url() and self.manager.old_urls_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                # print(new_url, '.......')
                html = self.downloader.download(new_url)
                # print(html)
                new_urls, data = self.parser.parse(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Crawled %s links so far' % self.manager.old_urls_size())
            except Exception as e:
                print(e)
                # print('crawl failed')
        self.output.output_html()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.manager.has_new_url() and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager one by one;
                # self.manager.add_new_url(new_urls) raised an error because a set is unhashable
                for new_url in new_urls:
                    self.manager.add_new_url(new_url)
                # Store the extracted data
                self.output.store_data(data)
                print('Crawled %s links so far' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed with ' + str(e))
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManger()
        self.downloader = HtmlDownload()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        # print(self.manager.new_url_size())
        # print(self.manager.old_urls_size())
        while (self.manager.has_new_url() and self.manager.old_urls_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                # print(html)
                # print('new url:', new_url)
                new_urls, data = self.parser.parser(new_url, html)
                # print("len(new_urls):", len(new_urls))
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Crawled %s links so far' % self.manager.old_urls_size())
            except Exception as e:
                print(e, 'Crawl failed')
        self.output.output_html()
        print("Saved to baike.html")
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawlOneTag(self, book_tag):
        # Crawl the first three listing pages for one tag and collect the parsed books
        page_num = 0
        book_list = []
        while page_num <= 2:
            try:
                new_url = self.manager.get_new_url(page_num, book_tag)
                html = self.downloader.download(new_url)
                book_list += self.parser.parser(html)
            except Exception as e:
                print("crawl failed:", e)
            page_num += 1
        return book_list

    def crawlAllTags(self, book_tag_lists, topath):
        # Crawl every tag, sort each tag's books by the second field (descending), then write the result
        book_lists = []
        for book_tag in book_tag_lists:
            book_list = self.crawlOneTag(book_tag)
            book_list = sorted(book_list, key=lambda x: x[1], reverse=True)
            book_lists.append(book_list)
        self.output.output(book_lists, book_tag_lists, topath)
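# A sketch of how the tag-based scheduler above might be invoked. The tag values
# and the output path are illustrative assumptions, not taken from the source;
# book_tag_lists is presumably a list of plain tag strings and topath the target file.
if __name__ == '__main__':
    spider = SpiderMan()
    book_tag_lists = ['小说', '历史', '编程']  # placeholder tags
    spider.crawlAllTags(book_tag_lists, 'book_lists.txt')  # placeholder output path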
class SpiderMan(object):
    '''Spider scheduler.

    Attributes:
        manager: URL manager
        downloader: HTML downloader
        parser: HTML parser
        output: data output / storage
    '''

    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        '''Scheduling loop of the spider.

        Args:
            root_url: entry URL of the crawl

        Raises:
            Exception: 'NoneType' object has no attribute
        '''
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url() and self.manager.old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Crawled %s links so far' % self.manager.old_url_size())
            except Exception as e:
                print('Crawl failed: %s' % e)
        self.output.output_html()
import time


class SpiderMan(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        urls = self.parser.parser_url(root_url, content)
        # Build the URL of the rating / box-office Ajax interface for each movie
        for url in urls:
            try:
                time.sleep(0.1)
                t = time.strftime("%Y%m%d%H%M%S", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                    '?Ajax_CallBack=true' \
                    '&Ajax_CallBackType=Mtime.Library.Services' \
                    '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                    '&Ajax_CrossDomain=1' \
                    '&Ajax_RequestUrl=%s' \
                    '&t=%s' \
                    '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1])
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception, e:
                print 'Crawl failed', e
        self.output.output_end()
        print "Crawl finish"
class SpiderRun(object):
    '''Main scheduling program of the spider.'''

    def __init__(self):
        self.manager = URLManage()
        self.parser = HTMLParse()
        self.downloader = HTMLDownloader()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 5:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print "Crawling URL:", new_url
                print "Crawled {} links so far".format(self.manager.old_url_size())
            except Exception, e:
                print "Exception is:", e
        self.output.output_html()
        print self.manager.new_url_size()
        print self.output.datas_size()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.manager.has_new_url() and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract data and new URLs with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data to a file
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed", e)
        # Write the stored data out in the target format
        self.output.output_html()
import random
import time


class SpiderMain(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = FileDownLoader()
        self.parser = FileParser()
        self.output = DataOutput()

    def crawl(self, root_files):
        for root_file in root_files:
            # Extract the URLs listed in each root file and queue them
            new_urls = self.parser.parser(root_file)
            self.manager.add_new_urls(new_urls)
            while (self.manager.has_new_url()):
                try:
                    new_url = self.manager.get_new_url()
                    data = self.downloader.download(new_url)
                    self.output.store_data(data, root_file, new_url)
                    print("Crawled %s links so far" % self.manager.old_url_size())
                    # Sleep for a random 1-3 seconds between downloads
                    interval = random.randint(1, 3)
                    time.sleep(interval)
                    print("sleep: %d" % interval)
                except Exception as err:
                    self.output.mark_result(root_file, new_url, False)
                    print("crawl failed: " + str(err))
import time


class SpiderMan(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParse()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.downloader(root_url)
        urls = self.parser.parser_url(root_url, content)
        # Build the URL of the rating / box-office Ajax interface for each movie
        for url in urls:
            print '---------->URL', url, url[0], url[1]
            try:
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                    '?Ajax_CallBack=true' \
                    '&Ajax_CallBackType=Mtime.Library.Services' \
                    '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                    '&Ajax_CrossDomain=1' \
                    '&Ajax_RequestUrl=%s' \
                    '&t=%s' \
                    '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1])
                rank_content = self.downloader.downloader(rank_url)
                print 'Ajax response (Chinese text renders correctly)-------->', rank_content
                print 'Ajax response type is unicode-------->', type(rank_content)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception, e:
                print 'Failed to fetch the Ajax data', e
        self.output.output_end()
        print '=======end========='
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def start(self, url, numMax=50):
        # Seed the URL manager and crawl until numMax pages have been fetched
        self.manager.addUrl(url)
        num = 0
        errorsNum = 0
        while self.manager.sizeofNew() != 0 and num < numMax:
            try:
                num = num + 1
                url = self.manager.getUrl()
                print('%d\n %s' % (num, url))
                html = self.downloader.download(url)
                newUrls, data = self.parser.parser(url, html)
                self.output.addData(data)
                # Only queue more URLs while the total stays below numMax
                if self.manager.sizeofNew() + self.manager.sizeofOld() < numMax:
                    self.manager.addUrls(newUrls)
                print(data['title'])
            except Exception:
                num = num - 1
                errorsNum = errorsNum + 1
                print('crawl failed %d' % errorsNum)
        self.output.outputData()
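# A minimal driver for the start() variant above, assuming it is run as a script.
# The seed URL and the numMax value are placeholders for illustration, not from the source.
if __name__ == '__main__':
    spider = SpiderMan()
    spider.start('https://example.com/start-page', numMax=20)  # placeholder seed and limit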
class Schedul(object):
    def __init__(self):
        self._manager = UrlManager()
        self._download = HtmlDownload()
        self._parse = HtmlParse()
        self._output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self._manager.add_new_url(root_url)
        # Loop while there are still new URLs
        while (self._manager.has_new_url()):
            try:
                # Get a new URL from the URL manager
                new_url = self._manager.get_new_url()
                # Download the page
                response = self._download.download(new_url)
                # Parse the page and extract its data
                new_url, data = self._parse.parse(new_url, response)
                # Add the newly extracted URLs to the URL manager
                self._manager.add_new_url(new_url)
                # Store the data
                self._output.store_data(data)
                print('Crawled %s links so far' % self._manager.old_url_size())
                print(new_url)
            except Exception as e:
                print('crawl Error', e)
        # Write the stored data to the database
        try:
            self._output.output_db()
        except:
            print('output Error')
class SpiderMan(object):
    def __init__(self):
        self.manger = UrlManger()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        print 'crawl %s ' % root_url
        self.manger.add_new_url(root_url)
        # pdb.set_trace()
        while (self.manger.has_new_url() and self.manger.old_urls_size() < 100):
            try:
                new_url = self.manger.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manger.add_new_urls(new_urls)
                self.output.store_data(data)
                print 'Has crawled %s links ' % self.manger.old_urls_size()
            except Exception, e:
                print "crawl failed %s" % e
                break
        self.output.output_html()
class Spiderman(object):
    def __init__(self):
        self.manager = Urlmanager()
        self.downloader = HtmlDownloader()
        self.parser = Htmlparser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.manager.has_new_url() and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract data with the HTML parser
                new_urls, data = self.parser.parse(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the extracted data
                self.output.store_data(data)
                print('Crawled %s links so far' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed')
                raise
        # Write the stored data out in the target format
        self.output.output_html()
class SpiderMan:
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # print(html)
                # Extract data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the extracted data
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed", e)
        self.output.output_html()
class Spider_Scheduler(object):
    def __init__(self):
        self.urlmanager = UrlQueue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Seed the URL manager with the entry URL
        self.urlmanager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.urlmanager.has_new_url() and self.urlmanager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.urlmanager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.urlmanager.add_new_urls(new_urls)
                # Serialize the extracted data into the output list
                self.output.data_to_list(data)
                print("Crawled %s links so far" % self.urlmanager.old_url_size())
            except Exception as e:
                print("crawl failed", e)
        # Write the stored data out in the target format
        self.output.output_html()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url() and self.manager.old_url_size() < 50):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                if html is None:
                    print('failed to get page')
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('has scraped %s links' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed', e)
        self.output.output_html()
import pyprind


class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()  # connects to the database on instantiation

    def crawl(self):
        self.output.create_table()  # create the table
        self.manager.add_new_urls()  # build the URLs
        total = self.manager.new_urls_size()
        bar = pyprind.ProgBar(30, title="Crawling......")  # progress bar
        while (self.manager.new_urls_size()):
            url = self.manager.get_new_url()
            html = self.downloader.download(url)
            data = self.parser.parse(html)
            errors, errors_messages = self.output.insert_into_db(data)  # insert into the database
            bar.update()
            '''
            sys.stdout.write(str(self.manager.old_urls_size() / total * 100) + "%")
            sys.stdout.flush()
            # print('crawled', self.manager.old_urls_size(), 'records')
            '''
        self.output.close_cursor()  # close the database connection
        print("Crawled", total, "records in this run")
        if errors:
            print(errors, "of the records had errors")
            print("Errors: " + str(errors_messages))
class SpiderMain(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.manager.has_new_url() and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the extracted data
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
                # print(new_url)
            except Exception as e:
                print("crawl failed", e)
        # Write the stored data out in the target format
        self.output.output_html()
import traceback


class SpiderMan(object):
    """Spider scheduler."""

    def __init__(self):
        self.urlManager = UrlManager()
        self.htmlDownloader = HtmlDownloader()
        self.htmlParser = HtmlParser()
        self.htmlOutput = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.urlManager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.urlManager.has_new_url() and self.urlManager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.urlManager.get_new_url()
                # Download the page with the HTML downloader
                html = self.htmlDownloader.download(new_url)
                # Extract data with the HTML parser
                new_urls, data = self.htmlParser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.urlManager.add_new_urls(new_urls)
                # Store the extracted data
                self.htmlOutput.store_data(data)
            except Exception as e:
                print(traceback.format_exc())
        # Write the stored data out in the target format
        self.htmlOutput.output_html()
def __init__(self, sort, sort_url, sortFilename):
    # Each crawler thread owns its own manager/downloader/parser/output for one category (sort)
    threading.Thread.__init__(self)
    self.sort = sort
    self.sort_url = sort_url
    self.sortFilename = sortFilename
    self.manager = UrlManager(self.sort)
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()
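# Since this __init__ subclasses threading.Thread, each category presumably crawls in
# its own thread. The run() below is only a hypothetical sketch of that loop: the real
# method, its stop condition, and the output call are not shown in the source, and the
# manager/downloader/parser/output method names are assumed to match the other schedulers here.
def run(self):
    self.manager.add_new_url(self.sort_url)  # seed with this category's URL
    while self.manager.has_new_url():
        try:
            new_url = self.manager.get_new_url()
            html = self.downloader.download(new_url)
            new_urls, data = self.parser.parser(new_url, html)
            self.manager.add_new_urls(new_urls)
            self.output.store_data(data)
        except Exception as e:
            print('thread for %s failed: %s' % (self.sort, e))
    self.output.output_html(self.sortFilename)  # assumed per-category output file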
def add_new_urls(self):
    # Build a detail-page URL for every document id that is not already in the database
    docids = self.get_DocID()
    db = DataOutput()
    old_docids = db.get_old_docids()
    db.close_cursor()
    for docid in docids:
        if docid not in old_docids:  # deduplicate against previously crawled ids
            self.new_urls.add(
                "http://wenshu.court.gov.cn/CreateContentJS/CreateContentJS.aspx?DocID=" + docid)
    print("URLs built, ready to start crawling……")
import pyprind


class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()  # connects to the database on instantiation
        self.s = Settings().setting

    def crawl(self):
        self.output.create_table()  # create the table
        total_page = self.s["Index"][1] - self.s["Index"][0]
        total_data = total_page * self.s["Page"]
        total_errors = 0
        total_duplicates = 0
        old_total = self.output.get_total()
        for Index in range(self.s["Index"][0], self.s["Index"][1]):
            duplicates = self.manager.add_urls(Index, self.output)
            urls = self.manager.get_urls()
            bar = pyprind.ProgBar(self.s["Page"] - duplicates,
                                  title="Crawling Page " + str(Index) + " ......")  # progress bar
            for url in urls:
                try:
                    bar.update()
                    html = self.downloader.download(url)
                    data = self.parser.parse(html)
                    self.output.insert_into_db(data)  # insert into the database
                except Exception:
                    continue
        new_total = self.output.get_total()
        self.output.close_cursor()  # close the database connection
        print("Crawled", new_total - old_total, "records in this run")
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.dataoutput = DataOutput()
        self.mongoengine = Use_MongoEngine()
        self.urloutput = Url_info_Output()

    def crawl(self, initial_url):
        # Record the entry URL, write out the HTML report, and count the stored patents
        self.urloutput.output_url_info(initial_url)
        self.dataoutput.output_html()
        self.mongoengine.count()
        print("Crawled %s patents in total" % self.mongoengine.count())
def store_proc(self, store_q):
    '''The storage process reads data from store_q and hands it to the data output for storage.

    :param store_q: queue of parsed data items; the string 'end' signals shutdown
    :return:
    '''
    output = DataOutput()
    while True:
        if not store_q.empty():
            data = store_q.get()
            if data == 'end':
                print('Storage process received the stop notice and is exiting!')
                return
            output.store_data(data)
        else:
            time.sleep(0.2)
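# store_proc looks like the storage worker of a multi-process crawler. A minimal sketch
# of how it might be wired up with a multiprocessing queue follows; the NodeManager class
# name, and the idea that some parser process feeds store_q, are assumptions, not from the source.
from multiprocessing import Process, Queue

if __name__ == '__main__':
    store_q = Queue()
    node = NodeManager()  # assumed owner of store_proc()
    store = Process(target=node.store_proc, args=(store_q,))
    store.start()
    # ... a crawler/parser process would put parsed items on store_q here ...
    store_q.put('end')  # tell the storage process to finish
    store.join()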
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url() and self.manager.old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print "already got %s urls" % self.manager.old_url_size()
            except Exception, e:
                print "crawl failed", e
        self.output.output_html()
def __init__(self):
    self.manager = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()