def __init__(self): """构造函数,初始化属性""" self.urls = UrlManager() self.log = MyLog("spider_main", "logs") self.downloader = HtmlDownloader() self.parser = HtmlParser() self.outputer = HtmlOutputer()
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, url):
        count = 1
        self.urls.add_new_url(url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                html_cont = self.downloader.download(new_url)
                new_urls, html_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(html_data)
                print("%d craw success : %s" % (count, new_url))
                if count >= 10:
                    break
                count = count + 1
            except Exception as e:
                print(str(e))
                print("%d craw failed : %s" % (count, new_url))
        self.outputer.output()
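None of the main modules in this collection include the four collaborator classes they construct. As a point of reference, here is a minimal sketch of the UrlManager interface they all call (add_new_url, add_new_urls, has_new_url, get_new_url); the two-set deduplication scheme is an assumption, not the original implementation:

class UrlManager(object):
    """Tracks URLs waiting to be crawled and URLs already crawled."""

    def __init__(self):
        self.new_urls = set()   # URLs not yet fetched
        self.old_urls = set()   # URLs already fetched

    def add_new_url(self, url):
        # Ignore URLs we have already seen in either set.
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # Pop one pending URL and remember it as crawled.
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url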
import time


class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = HtmlOutputer()

    def craw(self, root_url, page_amount=5, time_sleep=None):
        count = 1
        # Seed the URL manager with the first URL to crawl.
        self.urls.add_new_url(root_url)
        # Keep taking URLs from the set; exit the loop when none are left.
        while self.urls.has_new_url():
            try:
                # Start crawling.
                new_url = self.urls.get_new_url()
                print(f'craw {count}: {new_url}')
                # Request the URL and get back the HTML.
                html_content = self.downloader.download(new_url)
                # Parse the HTML with XPath to extract the data we need.
                new_urls, new_data = self.parser.parse(html_content)
                # Queue the <a> links found on this entry page so they
                # are crawled later.
                self.urls.add_new_urls(new_urls)
                self.output.collect_data(new_url, new_data)
                count += 1
                if count > page_amount:
                    break
                # Be polite between requests; default to 2 seconds when no
                # interval is passed in (the original hard-coded 2 and left
                # the time_sleep parameter unused).
                time.sleep(time_sleep or 2)
            except Exception as e:
                print(e)
                print(f'craw failed: {new_url}')
        self.output.output_html()
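The downloader is likewise never shown in any of these snippets. A minimal sketch of an HtmlDownloader exposing the download(url) method used above, assuming a plain urllib fetch that returns the page source, or None on failure:

import urllib.request


class HtmlDownloader(object):
    """Fetches a URL and returns its page source, or None on failure."""

    def download(self, url):
        if url is None:
            return None
        try:
            response = urllib.request.urlopen(url, timeout=10)
        except Exception:
            return None
        if response.getcode() != 200:
            return None
        return response.read().decode('utf-8', errors='ignore')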
class SpiderMain():
    def __init__(self):
        # URL manager
        # self.urls = UrlManager.UrlManager()
        self.urls = UrlManager()
        # URL downloader
        # self.downloader = HtmlDownloader.HtmlDownloader()
        self.downloader = HtmlDownloader()
        # URL parser
        # self.parser = html_parser.HtmlParser()
        self.parser = HtmlParser()
        # self.outputer = html_outputer.HtmlOutputer()
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        count = 1
        originSet = set()
        originSet.add(root_url)
        self.urls.add_new_urls(originSet)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print("craw %d : %s" % (count, new_url))
                html_cont = self.downloader.download(new_url)
                # Report whether the page downloaded successfully.
                downStat = "ERROR"
                if html_cont is not None:
                    downStat = "SUCCESS"
                print("[Page ID : %d downloader %s!]" % (count, downStat))
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                # print("\nnew_urls[%s], new_data[%s]" % (new_urls, new_data))
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 15:
                    break
                count = count + 1
            except Exception as err:
                print("craw failed! ERROR information : %s" % err)
        self.outputer.output_html()
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        count = 1
        # Put the root URL into the URL manager.
        self.urls.add_new_url(root_url)
        # Main page-crawl loop.
        while self.urls.has_new_url():
            try:
                # Take one URL waiting to be crawled.
                new_url = self.urls.get_new_url()
                print('craw %d: %s' % (count, new_url))
                # Download the content of that URL.
                html_cont = self.downloader.download(new_url)
                # Parse the downloaded content into new URLs and new data.
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                # Queue the newly found URLs in the URL manager.
                self.urls.add_new_urls(new_urls)
                # Hand the new data to the output collector.
                self.outputer.collect_data(new_data)
                if count == 100:
                    break
                count += 1
            except Exception as e:
                print('craw failed: %s' % e)
        self.outputer.output_html()
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                # Fetch a new URL and download its content.
                new_url = self.urls.get_new_url()
                html_cont = self.downloader.download(new_url)
                # Parse the content into follow-up URLs and a data record.
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                # Queue the parsed URLs and collect the parsed data.
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 200:
                    break
                count = count + 1
            except Exception:
                print("craw failed")
        self.outputer.output_html()
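The parser's contract in most of these variants is parse(page_url, html_cont) returning (new_urls, new_data). Below is a hedged sketch of that shape; the BeautifulSoup selectors and the fields in new_data are placeholders, since the real ones depend on the target site (and one variant above passes only the HTML):

from urllib.parse import urljoin

from bs4 import BeautifulSoup


class HtmlParser(object):
    """Extracts follow-up links and one data record from a page."""

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return set(), None
        soup = BeautifulSoup(html_cont, 'html.parser')
        # Collect absolute URLs from every <a href> on the page; real
        # implementations filter these with a site-specific pattern.
        new_urls = set()
        for link in soup.find_all('a', href=True):
            new_urls.add(urljoin(page_url, link['href']))
        # The "data" payload here is just the page title; the actual
        # fields depend on the site being scraped.
        title_tag = soup.find('title')
        new_data = {
            'url': page_url,
            'title': title_tag.get_text() if title_tag else '',
        }
        return new_urls, new_data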
class SpiderMain(object): """docstring for SpiderMain""" def __init__(self): self.urlManage = UrlManage() self.downloader = HtmlDownloader() self.parser = HtmlParser() self.outputer = HtmlOutputer() def craw(self,url): self.urlManage.add_new_url(url) count = 1 while self.urlManage.has_new_url(): url = self.urlManage.get_new_url() print '%dth page,address:%s' % (count,url) html_content = self.downloader.downloadPage(url) new_urls,new_data = self.parser.parse(html_content,url) self.urlManage.add_new_urls(new_urls) self.outputer.collect_data(new_data) if count == 10: break count = count + 1 self.outputer.output_html()
class SpiderMain(): """爬虫程序主模块""" def __init__(self): """构造函数,初始化属性""" self.urls = UrlManager() self.log = MyLog("spider_main", "logs") self.downloader = HtmlDownloader() self.parser = HtmlParser() self.outputer = HtmlOutputer() #self.util=utill.DBConn() def craw(self, root_url): """爬虫入口函数""" areas = { "gulou": 100, "jianye": 72, "qinhuai": 100, "xuanwu": 67, "yuhuatai": 32, "qixia": 62, "baijiahu": 33, "chalukou1": 26, "jiangningqita11": 3, "dongshanzhen": 29, "jiangningdaxuecheng": 15, "jiulonghu": 12, "jiangjundadao11": 22, "kexueyuan": 9, "qilinzhen": 42, "tiexinqiao": 9, "pukou": 100, "liuhe": 1, } #areas = {"gulou":1} #1、抓取所有二手房详情界面链接,并将所有连接放入URL管理模块 for area, pg_sum in areas.items(): for num in range(1, pg_sum + 1): #1.1 拼接页面地址: https://nj.lianjia.com/ershoufang/gulou/pg2/ pg_url = root_url + area + "/pg" + str(num) + "/" self.log.logger.info("1.1 拼接页面地址:" + pg_url) print("1.1 拼接页面地址:" + pg_url) #1.2 启动下载器,下载页面. try: html_cont = self.downloader.download(pg_url) except Exception as e: self.log.logger.error("1.2 下载页面出现异常:" + repr(e)) time.sleep(60 * 30) else: #1.3 解析PG页面,获得二手房详情页面的链接,并将所有链接放入URL管理模块 try: ershoufang_urls = self.parser.get_erhoufang_urls( html_cont) except Exception as e: self.log.logger.error("1.3 页面解析出现异常:" + repr(e)) else: self.urls.add_new_urls(ershoufang_urls) #暂停0~3秒的整数秒,时间区间:[0,3] time.sleep(random.randint(0, 3)) time.sleep(60 * 20) #2、解析二手房具体细心页面 id = 1 stop = 1 while self.urls.has_new_url(): #2.1 获取url try: detail_url = self.urls.get_new_url() self.log.logger.info("2.1 二手房页面地址:" + detail_url) print("2.1 二手房页面地址:" + detail_url) except Exception as e: print("2.1 拼接地址出现异常") self.log.logger.error("2.1 拼接地址出现异常:" + detail_url) #2.2 下载页面 try: detail_html = self.downloader.download(detail_url) except Exception as e: self.log.logger.error("2.2 下载页面出现异常:" + repr(e)) self.urls.add_new_url(detail_url) time.sleep(60 * 30) else: #2.3 解析页面 try: ershoufang_data = self.parser.get_ershoufang_data( detail_html, id) except Exception as e: self.log.logger.error("2.3 解析页面出现异常:" + repr(e)) else: #2.4 输出数据 try: self.outputer.collect_data(ershoufang_data) except Exception as e: self.log.logger.error("2.4 输出数据出现异常:" + repr(e)) else: print(id) id = id + 1 stop = stop + 1 #暂停0~3秒的整数秒,时间区间:[0,3] time.sleep(random.randint(0, 3)) if stop == 2500: stop = 1 time.sleep(60 * 20)
import os
import time

import threadpool


class GrabMain(object):
    def __init__(self, url):
        self.root_url = url
        self.urlManager = UrlManager()
        self.dLoader = HtmlDLoader()
        self.contParser = HtmlParser()
        self.contOutputer = HtmlOutputer()

    def grabText(self):
        if self.root_url is None:
            return
        self.urlManager.add_new_next_url(self.root_url)
        self.contParser.parser_set(None, None, None, None, None)
        # First pass: walk the "next page" links and collect content URLs.
        while self.urlManager.get_new_next_count():
            try:
                new_url = self.urlManager.get_new_next_url()
                html_cont = self.dLoader.download(new_url)
                urls, nexts = self.contParser.parser_text_urls(html_cont)
                self.urlManager.add_new_next_urls(nexts)
                self.urlManager.add_new_urls(urls)
            except Exception:
                print("url is error.")
        # Second pass: fetch the collected content URLs on a 10-thread pool.
        pool = threadpool.ThreadPool(10)
        requests = threadpool.makeRequests(self.thread_grabText,
                                           self.urlManager.new_urls)
        [pool.putRequest(req) for req in requests]
        pool.wait()

    def thread_grabText(self, url):
        try:
            print("curr url is %s." % url)
            html_cont = self.dLoader.download(url)
            title, cont = self.contParser.parser_text_cont(html_cont)
            self.contOutputer.output_cont(title, cont)
        except Exception:
            print("url is %s, error." % url)

    def grabImgs(self):
        if self.root_url is None:
            return None
        self.urlManager.add_new_next_url(self.root_url)
        self.contParser.parser_set(None, None, None, None, None)
        while self.urlManager.get_new_next_count():
            try:
                new_url = self.urlManager.get_new_next_url()
                html_cont = self.dLoader.download(new_url)
                urls, nexts = self.contParser.parser_text_urls(html_cont)
                self.urlManager.add_new_next_urls(nexts)
                self.urlManager.add_new_urls(urls)
            except Exception:
                print("url is error.")
        pool = threadpool.ThreadPool(10)
        requests = threadpool.makeRequests(self.thread_grabImg,
                                           self.urlManager.new_urls)
        [pool.putRequest(req) for req in requests]
        pool.wait()

    def thread_grabImg(self, url):
        try:
            print("curr url is %s." % url)
            html_cont = self.dLoader.download(url)
            title, links = self.contParser.parser_img_cont(html_cont)
            if links is None or len(links) == 0:
                print("url is %s, not src." % url)
                return None
            if title is None:
                title = str(time.time())  # fall back to a timestamp dir name
            try:
                if not os.path.isdir(title):
                    os.mkdir(title)
            except Exception:
                title = str(time.time())
                if not os.path.isdir(title):
                    os.mkdir(title)
            # One download task per image link, run on a 12-thread pool.
            params = []
            index = 0
            for link in links:
                params.append(([title, link, index], None))
                index += 1
            pool = threadpool.ThreadPool(12)
            requests = threadpool.makeRequests(self.contOutputer.output_img,
                                               params)
            [pool.putRequest(req) for req in requests]
            pool.wait()
        except Exception:
            print("url is %s, error." % url)
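For completeness, a typical entry point for the SpiderMain variants above, whichever one is in use; the start URL here is a placeholder, since each project supplies its own:

if __name__ == '__main__':
    root_url = 'https://example.com/start-page'  # placeholder start URL
    spider = SpiderMain()
    spider.craw(root_url)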