Example #1
 def work(self, task_queue, result_queue, url_queue):
     downloader = HtmlDownloader()
     parser = HtmlParser()
     while True:
         while not task_queue.empty():
             new_url = task_queue.get()
             print('Got new task: %s' % new_url)
             response = downloader.download(new_url)
             items = parser.parser(response)
             if len(items) > 1:
                 # Every element except the last is a parsed product dict;
                 # the last element is the next-page URL. Iterating over
                 # items[:-1] instead of a hardcoded range(0, 60) avoids an
                 # IndexError on pages with fewer products.
                 for item in items[:-1]:
                     product_rate_url = item.get('product_rate_url')
                     print('Got link: %s' % product_rate_url)
                     other_store_url = item.get('other_store_url')
                     print('Got link: %s' % other_store_url)
                     url_queue.put(product_rate_url)
                     url_queue.put(other_store_url)
                     print('Got result: %s' % str(item))
                     result_queue.put(item)
                 next_page_url = items[-1]
                 if next_page_url == 'No next page':
                     print('Reached the last page; the worker node is about to finish')
                     result_queue.put('end')
                     return
                 url_queue.put(next_page_url)
                 print('Got link: %s' % next_page_url)
             else:
                 print('Got result: %s' % str(items[0]))
                 result_queue.put(items[0])
Example #2
 def work(self, task_queue, result_queue, url_queue):
     downloader = HtmlDownloader()
     parser = HtmlParser()
     while True:
         while not task_queue.empty():
             new_url = task_queue.get()
             if new_url == 'end':
                 print('Crawling finished')
                 return
             print('Got new task: %s' % new_url)
             response = downloader.download(new_url)
             items, next_page = parser.parser(response)
             url_queue.put(next_page)
             for item in items:
                 print('Task done: %s' % item)
                 result_queue.put(item)
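
The work() loops in Examples #1 and #2 receive their three queues from a control node. Below is a minimal sketch of how a worker might obtain them, assuming the control node shares the queues over the network via multiprocessing.managers.BaseManager; the registration names, the address and authkey, and the SpiderWork class name are illustrative assumptions, not taken from the snippets above.

from multiprocessing.managers import BaseManager

class QueueManager(BaseManager):
    pass

# Register the names under which the control node exposed its queues;
# on the client side only the names are needed, no callables.
QueueManager.register('get_task_queue')
QueueManager.register('get_result_queue')
QueueManager.register('get_url_queue')

# Connect to the control node and fetch proxies for the shared queues.
m = QueueManager(address=('127.0.0.1', 8001), authkey=b'crawler')
m.connect()
task_queue = m.get_task_queue()
result_queue = m.get_result_queue()
url_queue = m.get_url_queue()

# SpiderWork stands in for whatever class defines work() above.
SpiderWork().work(task_queue, result_queue, url_queue)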
Example #3
 def __init__(self):
     self.manager = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.output = DataOutput()  # connects to the database on instantiation
     self.s = Settings().setting
Example #4
 def __init__(self):
     self.manager = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.output = DataOutput()
Example #5
'''
Crawler scheduler (entry point)
'''
from URLManager import UrlManager
from HTMLDownloader import HtmlDownloader
from HTMLParser import HtmlParser
from DataOutput import DataOutput
from settings import Settings
from random import random
import time
import datetime
import threading
import multiprocessing

manager = UrlManager()
downloader = HtmlDownloader()
parser = HtmlParser()
s = Settings().setting
max_threads = 3
base_url = "http://wenshu.court.gov.cn/CreateContentJS/CreateContentJS.aspx?DocID="


# Crawl one document at a time
def crawl():
    db = DataOutput()  # connect to the database
    old_total = db.get_total()
    while db.has_unvisited():
        docid = manager.get_one_docid(db)
        url = base_url + docid
        for _ in range(3):
            try:
                # The source snippet is truncated here; a minimal, assumed
                # completion: download the page and stop retrying on success.
                response = downloader.download(url)
                break
            except Exception:
                time.sleep(1 + random())  # brief backoff before the next attempt
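
Example #5 imports threading and defines max_threads, but the snippet breaks off before the scheduler body, so the fan-out below is an assumption: each of max_threads threads runs crawl() until the database reports no unvisited docids left.

def run():
    # Assumed wiring: start max_threads crawl() workers and wait for them.
    threads = [threading.Thread(target=crawl) for _ in range(max_threads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

if __name__ == '__main__':
    run()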
Example #6
 def __init__(self):
     self.download = HtmlDownloader()
     self.par = HtmlParser()