def work(self, task_queue, result_queue, url_queue):
    downloader = HtmlDownloader()
    parser = HtmlParser()
    while True:
        while not task_queue.empty():
            new_url = task_queue.get()
            print('Got new task: %s' % new_url)
            response = downloader.download(new_url)
            items = parser.parser(response)
            if len(items) > 1:
                for i in range(0, 60):
                    product_rate_url = items[i].get('product_rate_url')
                    print('Got link: %s' % product_rate_url)
                    other_store_url = items[i].get('other_store_url')
                    print('Got link: %s' % other_store_url)
                    url_queue.put(product_rate_url)
                    url_queue.put(other_store_url)
                    print('Got result: %s' % str(items[i]))
                    result_queue.put(items[i])
                next_page_url = items[-1]
                if next_page_url == 'No next page':
                    print('Reached the last page, worker node is about to stop')
                    result_queue.put('end')
                    return
                url_queue.put(next_page_url)
                print('Got link: %s' % next_page_url)
            else:
                print('Got result: %s' % str(items[0]))
                result_queue.put(items[0])

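# A minimal sketch, assuming the three queues used by work() above live on a
# control node and are exposed through multiprocessing.managers.BaseManager.
# The registered names, the address (127.0.0.1:8001) and the authkey are
# illustrative assumptions, not the project's actual configuration;
# spider_work would be the bound work() method of whatever class defines it.
from multiprocessing.managers import BaseManager


class QueueManager(BaseManager):
    pass


def run_slave(spider_work):
    # Register the shared queues by name only; they are created on the control node.
    QueueManager.register('get_task_queue')
    QueueManager.register('get_result_queue')
    QueueManager.register('get_url_queue')
    manager = QueueManager(address=('127.0.0.1', 8001), authkey=b'spider')
    manager.connect()
    # Proxies to the remote queues, passed straight into the worker loop above.
    spider_work(manager.get_task_queue(),
                manager.get_result_queue(),
                manager.get_url_queue())
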
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()  # connects to the database on instantiation
        self.s = Settings().setting

    def crawl(self):
        self.output.create_table()  # create the table
        total_page = self.s["Index"][1] - self.s["Index"][0]
        total_data = total_page * self.s["Page"]
        total_errors = 0
        total_duplicates = 0
        old_total = self.output.get_total()
        for Index in range(self.s["Index"][0], self.s["Index"][1]):
            duplicates = self.manager.add_urls(Index, self.output)
            urls = self.manager.get_urls()
            bar = pyprind.ProgBar(self.s["Page"] - duplicates,
                                  title="Crawling Page " + str(Index) + " ......")  # progress bar
            for url in urls:
                try:
                    bar.update()
                    html = self.downloader.download(url)
                    data = self.parser.parse(html)
                    self.output.insert_into_db(data)  # insert into the database
                except Exception:
                    continue
        new_total = self.output.get_total()
        self.output.close_cursor()  # close the database connection
        print("Crawled", new_total - old_total, "records this run")

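# Hedged usage sketch for the Settings-driven scheduler above. From the way
# crawl() reads it, Settings().setting is expected to expose "Index" as a
# [start, stop) pair of page indices and "Page" as the number of URLs per
# index; the example values below are assumptions for illustration only.
if __name__ == '__main__':
    # e.g. Settings().setting == {"Index": [1, 11], "Page": 20}
    SpiderMan().crawl()
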
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Keep going while the URL manager has new URLs and fewer than 100 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                print("1")
                html = self.downloader.download(new_url)
                print("2")
                new_urls, data = self.parser.parser(new_url, html)
                print("3")
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                print("4")
                # Store the data with the data store
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed: %s" % e)
        # Export the stored data in the target format
        self.output.output_html()

class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()  # connects to the database on instantiation

    def crawl(self):
        self.output.create_table()  # create the table
        self.manager.add_new_urls()  # build the URL list
        total = self.manager.new_urls_size()
        errors, errors_messages = 0, []  # keep these defined even if the URL list starts empty
        bar = pyprind.ProgBar(30, title="Crawling......")  # progress bar
        while self.manager.new_urls_size():
            url = self.manager.get_new_url()
            html = self.downloader.download(url)
            data = self.parser.parse(html)
            errors, errors_messages = self.output.insert_into_db(data)  # insert into the database
            bar.update()
            '''
            sys.stdout.write(str(self.manager.old_urls_size() / total * 100) + "%")
            sys.stdout.flush()
            # print('Crawled', self.manager.old_urls_size(), 'records.')
            '''
        self.output.close_cursor()  # close the database connection
        print("Crawled", total, "records in total")
        if errors:
            print(errors, "of them failed")
            print("Errors: " + str(errors_messages))

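# Minimal way to run the scheduler above and time the whole crawl; the timing
# wrapper is an illustrative addition, not part of the original project.
import time

if __name__ == '__main__':
    start = time.time()
    SpiderMan().crawl()
    print("Finished in %.1f seconds" % (time.time() - start))
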
def work(self, task_queue, result_queue, url_queue):
    downloader = HtmlDownloader()
    parser = HtmlParser()
    while True:
        while not task_queue.empty():
            new_url = task_queue.get()
            if new_url == 'end':
                # 'end' sentinel from the control node: shut the worker down
                print('Crawling finished')
                return
            print('Got new task: %s' % new_url)
            response = downloader.download(new_url)
            items, next_page = parser.parser(response)
            url_queue.put(next_page)
            for item in items:
                print('Task finished: %s' % item)
                result_queue.put(item)

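# A hedged smoke test for this worker: seed one task followed by the 'end'
# sentinel so work() processes a single page and then returns. The queue type,
# the seed URL, and the spider instance are placeholders for illustration.
from multiprocessing import Queue


def smoke_test(spider, seed_url):
    task_queue, result_queue, url_queue = Queue(), Queue(), Queue()
    task_queue.put(seed_url)
    task_queue.put('end')  # makes the worker loop return after the seed task
    spider.work(task_queue, result_queue, url_queue)
    # Everything the parser extracted for the seed page ends up in result_queue.
    while not result_queue.empty():
        print(result_queue.get())
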
class SpiderMan(object):
    '''Spider scheduler

    Attributes:
        manager: URL manager
        downloader: HTML downloader
        parser: HTML parser
        output: data store
    '''

    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        '''Spider scheduling method

        Args:
            root_url: entry URL for the crawl

        Raises:
            Exception: 'NoneType' object has no attribute
        '''
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Crawled %s links so far' % self.manager.old_url_size())
            except Exception as e:
                print('Crawl failed: %s' % e)
        self.output.output_html()

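# Typical invocation of the scheduler above; the root URL is a placeholder for
# whatever entry page the concrete UrlManager and HtmlParser were written for.
if __name__ == '__main__':
    spider = SpiderMan()
    spider.crawl('https://example.com/start-page')
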
class Main(object):
    def __init__(self):
        self.download = HtmlDownloader()
        self.par = HtmlParser()

    def action(self, root_url):
        html = self.download.download(root_url)
        # Parse the URLs out of the current page with the parser
        urls = self.par.parser_url(html)
        # Iterate over the URLs and build the full API URL for each one
        for url in urls:
            t = time.strftime("%Y%m%d%H%M%S2877", time.localtime())
            new_url = "http://service.library.mtime.com/Movie.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Library.Services&Ajax_CallBackMethod=GetMovieOverviewRating&Ajax_CrossDomain=1&Ajax_RequestUrl=%s&t=%s&Ajax_CallBackArgument0=%s" % (url[0], t, url[1])
            print(new_url)
            # Hand the new URL to the downloader
            detail_html = self.download.download(new_url)
            print(detail_html)
            self.par.parser_json(detail_html)

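# Usage sketch for the Mtime crawler above; the root URL is assumed to be a
# theater/movie listing page whose links parser_url() can extract, and is an
# illustrative guess rather than the project's documented entry point.
if __name__ == '__main__':
    main = Main()
    main.action('http://theater.mtime.com/China_Beijing/')
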
def __init__(self):
    self.manager = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()  # connects to the database on instantiation
    self.s = Settings().setting

def __init__(self):
    self.manager = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()

Spider scheduler (entry point)
'''
from URLManager import UrlManager
from HTMLDownloader import HtmlDownloader
from HTMLParser import HtmlParser
from DataOutput import DataOutput
from settings import Settings
from random import random
import time
import datetime
import threading
import multiprocessing

manager = UrlManager()
downloader = HtmlDownloader()
parser = HtmlParser()
s = Settings().setting
max_threads = 3
base_url = "http://wenshu.court.gov.cn/CreateContentJS/CreateContentJS.aspx?DocID="


# Crawl one record at a time
def crawl():
    db = DataOutput()  # connect to the database
    old_total = db.get_total()
    while db.has_unvisited():
        docid = manager.get_one_docid(db)
        url = base_url + docid
        for _ in range(3):
            try:

def __init__(self):
    self.download = HtmlDownloader()
    self.par = HtmlParser()