class SpiderMain(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data with the data output component
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed: %s" % e)
        # Write the stored data out in the target format
        self.output.output_html()
class SpiderMan(object):
    '''Crawler scheduler.

    Attributes:
        manager: URL manager
        downloader: HTML downloader
        parser: HTML parser
        output: data storage
    '''
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        '''Crawler scheduling function.

        Args:
            root_url: entry URL for the crawl

        Raises:
            Exception: 'NoneType' object has no attribute
        '''
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Crawled %s links so far' % self.manager.old_url_size())
            except Exception as e:
                print('Crawl failed: %s' % e)
        self.output.output_html()
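Every scheduler in this collection drives its UrlManager through the same small interface: add_new_url / add_new_urls to queue links, has_new_url and get_new_url to drain them, and old_url_size (old_urls_size in a few variants) to count what has already been crawled. The UrlManager implementations themselves are not included in these excerpts, so the class below is only a minimal set-based sketch of that assumed interface, not any of the original projects' code.

class UrlManager(object):
    """Minimal sketch of the URL-manager interface the schedulers assume.
    It simply tracks a set of pending URLs and a set of crawled URLs."""

    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs that have already been crawled

    def has_new_url(self):
        return len(self.new_urls) != 0

    def add_new_url(self, url):
        # Ignore empty URLs and URLs that have already been seen
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def get_new_url(self):
        # Pop one pending URL and mark it as crawled
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def old_url_size(self):
        return len(self.old_urls)

With a manager like this in place, each scheduler is used the same way: construct it and call crawl (or crawler) with an entry URL, e.g. SpiderMan().crawl('http://example.com/entry-page'); that URL is only a placeholder here, since the entry points of the original projects are not shown in these excerpts.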
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_urls_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parse(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Crawled %s links so far' % self.manager.old_urls_size())
            except Exception as e:
                print(e)
        self.output.output_html()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract page data with the HTML parser
                # ("meiyou" is the parser's sentinel for "nothing found")
                new_th, new_td, new_urls = self.parser.parser(new_url, html, "th", "时间", "td")
                # Add the extracted URLs to the URL manager
                if new_urls != "meiyou":
                    self.manager.add_new_urls(new_urls)
                    print("Crawled %s links so far" % self.manager.old_url_size())
                # Store the extracted data
                if new_th != "meiyou" and new_td != "meiyou":
                    self.output.store_data(new_th, new_td)
                    self.output.output_html()
            except Exception as e:
                print("Crawl failed: %s" % e)
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager one by one.
                # self.manager.add_new_url(new_urls) raises an error because a set
                # is unhashable and cannot be added as a single element.
                for new_url in new_urls:
                    self.manager.add_new_url(new_url)
                # Store the data with the data output component
                self.output.store_data(data)
                print('Crawled %s links so far' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed with ' + str(e))
import time

def url_manager_proc(url_q, conn_q, root_url, num=6):
    """
    :param url_q: queue of single URLs handed out to the crawler nodes
    :param conn_q: queue of URL sets sent back by the crawler nodes
    :param root_url: entry URL for the crawl
    :param num: number of crawled URLs after which the crawl stops
    :return: None
    """
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            print("# url_manager_proc puts the next URL to crawl into url_q")
            new_url = url_manager.get_new_url()
            print(new_url)
            url_q.put(new_url)
            if url_manager.old_url_size() > num:
                # Tell the crawler nodes to stop working
                url_q.put('end')
                print('The control node sent the stop notice!')
                # Shut down the control node and persist the URL sets
                url_manager.save_progress()
                return
        try:
            if not conn_q.empty():
                print("# url_manager_proc takes urls out of conn_q")
                urls = conn_q.get()
                print(urls)
                url_manager.add_new_urls(urls)
            else:
                # Sleep briefly before checking again
                time.sleep(0.1)
        except Exception as e:
            print(e)
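Variants of url_manager_proc like the one above run as the control-node process of a distributed version of this spider: url_q carries URLs out to the crawler nodes and conn_q brings the URL sets they discover back in. None of these excerpts show how the queues are created or how the process is launched, so the sketch below is only a local illustration. It assumes plain multiprocessing queues, the standalone (url_q, conn_q, root_url, num) signature used above, a UrlManager that provides the save_progress persistence method referenced above, and a stand-in crawler loop; the real project distributes these queues across machines with a queue manager instead.

from multiprocessing import Process, Queue

# Hypothetical entry URL, used only for illustration
ROOT_URL = 'http://example.com/start-page'

def fake_crawler_proc(url_q, conn_q):
    # Stand-in for a crawler node: it takes URLs from url_q and, instead of
    # downloading and parsing them, reports back an empty set of new links.
    while True:
        url = url_q.get()
        if url == 'end':
            break
        print('crawler node received', url)
        conn_q.put(set())

if __name__ == '__main__':
    url_q = Queue()    # control node -> crawler nodes: URLs to fetch
    conn_q = Queue()   # crawler nodes -> control node: newly found URL sets

    manager = Process(target=url_manager_proc, args=(url_q, conn_q, ROOT_URL, 0))
    crawler = Process(target=fake_crawler_proc, args=(url_q, conn_q))
    manager.start()
    crawler.start()
    manager.join()
    crawler.join()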
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 50:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                if html is None:
                    print('failed to get page')
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('has scraped %s links' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed: %s' % e)
        self.output.output_html()
class EySpider(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def urlsCrawl(self, root_url):
        # Collect links only
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs; a crawl limit can be added
        # with self.manager.old_url_size() < N if needed
        while self.manager.has_new_url():
            try:
                # Take an uncrawled link from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page
                html = self.downloader.staticPageDownload(new_url)
                # Extract new URLs from the page
                urls = self.parser.urlsparser(html)
                self.manager.add_new_urls_to_old(new_url)
            except Exception as e:
                print("Failed to crawl links: %s" % e)

    def keywordsCrawl(self):
        while self.manager.has_new_url():
            try:
                # Take an uncrawled link from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page
                html = self.downloader.staticPageDownload(new_url)
                # Extract keywords from the page
                keywords = self.parser.Parser(html)
                self.manager.add_new_urls_to_old(new_url)
            except Exception as e:
                print("Failed to crawl keywords: %s" % e)
class SpiderMan:
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data with the data output component
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed: %s" % e)
        self.output.output_html()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownLoader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print(self.manager.old_url_size())
                print(data)
            except Exception as e:
                print('crawl failed: %s' % e)
        self.output.output_question()
        self.output.output_answer()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data in a file
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed: %s" % e)
        # Write the stored data out in the target format
        self.output.output_html()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 200 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 200:
            try:
                # Get a URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Put the extracted URLs into the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the extracted data
                self.output.store_data(data)
                print("Extracted %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed:", e)
        self.output.output_html()
def url_manager_proc(self, url_que, conn_que, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            # Send the new URL to the worker nodes
            url_que.put(new_url)
            print('old_url=', url_manager.old_urls_size())
            if url_manager.old_urls_size() > 2000:
                url_que.put('end')
                print('The control node sent the stop notice')
                # Shut down the control node and persist the URL sets
                url_manager.save_progress('new_urls.txt', url_manager.new_urls)
                url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                return
        # Add the URLs received from result_solve_proc to the URL manager
        try:
            if not conn_que.empty():
                urls = conn_que.get()
                for url in urls:
                    url_manager.add_new_url(url)
        except BaseException:
            time.sleep(0.1)
def url_manager_proc(self, task_queue, url_queue, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        if url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            print('URL %s put into the task queue' % new_url)
            task_queue.put(new_url)
        if not url_queue.empty():
            next_url = url_queue.get()
            url_manager.add_new_url(next_url)
def crawl_image(self, start_url, total_page, __page=2):
    '''Crawl the galleries of the Fengniao "masters" and "technique" sections.

    :param start_url: URL of the section to download from
    :param total_page: number of pages to download
    :param __page: internal counter for the next page; do not set it yourself
    :return: None
    '''
    manager = UrlManager()
    # Add the entry URL
    if 'image' in start_url or 'academy' in start_url:
        manager.add_new_url(start_url)
        # Loop while the URL manager has new URLs
        while manager.has_new_url():
            try:
                # Get a new URL from the URL manager
                new_url = manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Use a keyword to decide whether this is a second-level page
                if 'slide' in new_url:
                    # Extract data from the second-level page
                    data = self.parser.parse_data(html)
                    self.crawl_items(data)
                else:
                    # Extract URLs from the first-level page
                    data = self.parser.parse_urls(html)
                    manager.add_new_urls(data)
            except Exception as e:
                print('Crawl failed ==>', e)
        # Crawl the remaining pages
        if __page <= total_page:
            if 'image' in start_url:
                next_url = ('%s/index.php?action=getList&class_id=192'
                            '&sub_classid=0&page=%s&not_in_id=' % (start_url, str(__page)))
            elif 'academy' in start_url:
                next_url = ('%s/index.php?action=getList&class_id=190'
                            '&sub_classid=0&page=%s&not_in_id=' % (start_url, str(__page)))
            print('Starting to crawl ==> page', str(__page))
            return self.crawl_image(next_url, total_page, __page + 1)
    else:
        print('The URL is invalid, please check it')
def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            url_q.put(new_url)
            print('old_url=', url_manager.old_url_size())
            if url_manager.old_url_size() > 30:
                url_q.put('end')
                print('The control node sent the stop notice!')
                url_manager.save_progress('new_urls.txt', url_manager.new_urls)
                url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                return
        try:
            urls = conn_q.get()
            url_manager.add_new_urls(urls)
        except Exception:
            time.sleep(0.1)
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("already got %s urls" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed: %s" % e)
        self.output.output_html()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL, i.e. put the first link into the set of unprocessed URLs
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled.
        # Note: the links extracted from the first page already use up 70+ of that
        # budget, so URLs collected later in the loop are never visited. With n
        # iterations and m links per page the total content is n * m; a better design
        # would crawl all links of one page, then loop over those links' content
        # before moving on, i.e. a two-level loop is needed.
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                print(new_url)
                # Download the full listing page with the HTML downloader
                html = self.downloader.download(new_url)
                # Parse the page: collect all links plus one block of content
                new_urls, data = self.parser.parser(new_url, html)
                print(new_urls)
                # Each parse yields many URLs, all inserted into the unprocessed set,
                # but the loop only runs 100 times, so later URLs are never used
                print(len(new_urls))
                print(data)
                # Add the extracted URLs to the set of unprocessed URLs
                self.manager.add_new_urls(new_urls)
                # Append the data to the output file
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print(e)
                print("crawl failed")
        # Write the stored data out in the target format
        self.output.output_html()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawler(self, root_url):
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("Crawled {0} links so far".format(self.manager.old_url_size()))
            except Exception as e:
                print("crawler failed {0}".format(e))
        self.output.output_html()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        n = 0
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                if n == 0:
                    # Add the extracted URLs to the URL manager (first page only)
                    self.manager.add_new_urls(new_urls)
                # Store the data with the data output component
                self.output.store_data(data)
                n += 1
                print('Crawled %s links so far' % self.manager.old_url_size())
            except Exception as e:
                print(e)
        # Write the stored data out in the target format
        self.output.output_html(self.output.filepath)
        self.output.output_end(self.output.filepath)
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = Dataoutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print(e)
        self.output.output_html()
def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            url_q.put(new_url)
            print('old_url=', url_manager.old_url_size())
            if url_manager.old_url_size() > 2000:
                url_q.put('end')
                print('The control node sent the stop notice')
                url_manager.save_progress('new_urls.txt', url_manager.new_urls)
                url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                return
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except BaseException as e:
            print(e)
            time.sleep(0.1)
def url_manager_proc(self, url_q, conn_q, root_url):
    """Feed new URLs from the conn_q queue into the URL manager,
    and put URLs into url_q for the crawler nodes to pick up."""
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            url_q.put(new_url)
            logging.info("old_url_size = %s" % url_manager.old_url_size())
            if url_manager.old_url_size() > 50:
                url_q.put("end")
                logging.info("The control node sent the stop notice")
                url_manager.save_process("new_urls.txt", url_manager.new_urls)
                url_manager.save_process("old_urls.txt", url_manager.old_urls)
                return
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except BaseException:
            time.sleep(0.1)
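This variant reports progress through logging rather than print, so it assumes the logging module has been imported and configured somewhere in the control node's entry script. A minimal configuration sketch (the format string is an assumption, not taken from the original project):

import logging

# Assumed one-time setup before the control-node processes are started
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(processName)s %(levelname)s: %(message)s')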
def url_manager_proc(self, url_q, conn_q, root_url, num=200):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            url_q.put(new_url)
            if url_manager.old_url_size() > num:
                # Tell the crawler nodes to stop working
                url_q.put('end')
                print('The control node sent the stop notice!')
                # Shut down the control node and persist the URL sets
                url_manager.save_progress()
                return
        # When no URLs are pending, pull newly found ones from conn_q
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
            else:
                # Sleep briefly before checking again
                time.sleep(0.1)
        except Exception as e:
            print(e)
def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            print("url " + new_url)
            url_q.put(new_url)
            if url_manager.old_url_size() > 2000:
                url_q.put("end")
                print("The control node sent the stop notice!")
                url_manager.save_progress("new_urls.txt", url_manager.new_urls)
                url_manager.save_progress("old_urls.txt", url_manager.old_urls)
                return
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except BaseException:
            time.sleep(0.1)
class SxsIntern(object):
    def __init__(self):
        self.manger = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def get_collect_intern_urls(self, username, password):
        collect_url = '{0}/my/collect'.format(self.main_url)
        self.downloader.login(username, password)
        response = self.downloader.get(collect_url, session=self.downloader.session)
        page_num = self.parser.get_collect_page_num(response)
        print(f'Expecting to download {page_num} pages')
        for i in range(1, int(page_num) + 1):
            page_url = '{0}?p={1}'.format(collect_url, i)
            response = self.downloader.get(page_url, session=self.downloader.session)
            urls = self.parser.get_intern_urls(response, 'collect')
            self.manger.add_new_urls(urls)

    def get_job_urls(self, job='数据', city='北京', pages=100, release_time='ft-wek'):
        """Crawl listings for the given job keyword.

        Parameters
        ----------
        job: job keyword to search for
        city: city to search in, defaults to '北京' (Beijing)
        pages: maximum number of result pages to crawl, defaults to 100;
            if fewer pages exist, the actual page count is used
        release_time: publication window, defaults to 'ft-wek' (posted within
            the last week); accepted values are:
            'ft-day': within one day
            'ft-wek': within one week
            'ft-mon': within one month
        """
        city_code = self.parser.get_city_code(city)
        if release_time not in ['ft-day', 'ft-wek', 'ft-mon']:
            raise ValueError(
                'release_time must be one of ["ft-day", "ft-wek", "ft-mon"]')
        page = 1
        url = '{url}/interns/st-intern_c-{c}_{r}?k={k}&p={p}'.format(
            url=self.main_url, r=release_time, c=city_code, k=job, p=page)
        response = requests.get(url, headers=self.headers)
        # Read the total page count from the "last page" link
        page_num = re.search(r'<a href=\".*?p=(.*?)\">尾页', response.text).group(1)
        page_num = min(int(page_num), int(pages))
        print(f'Expecting to download {page_num} pages')
        response.close()
        # Process the result pages one by one
        for page in range(1, page_num + 1):
            url = '{url}/interns/st-intern_c-{c}_{r}?k={k}&p={p}'.format(
                url=self.main_url, r=release_time, c=city_code, k=job, p=page)
            response = requests.get(url, headers=self.headers)
            links = self._get_internlinks(response, 'jobs')
            self._links_parse(links)
            response.close()

    def crawl(self, root_url, save_path=None, max_amount=100):
        self.manger.add_new_url(root_url)
        while self.manger.has_new_url() and self.manger.old_url_size() < max_amount:
            try:
                new_url = self.manger.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manger.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Already found {} urls'.format(self.manger.old_url_size()))
            except Exception as e:
                print('Crawl failed: {}'.format(e))
        self.output.write(save_path)