Example #1
    def _start_engine(self):
        if ROLE == "master" or ROLE is None:  # the master sends the start requests; a non-distributed run also needs to send them
            # Handle the start requests
            # self._start_requests()  # ---> send requests
            """*** 1. Asynchronous, non-blocking ---> send requests ***"""
            self.pool.apply_async(self._start_requests)

        # Handle the requests coming from the scheduler
        # while True:
        # ----> execute requests, but the level of concurrency cannot be controlled
        # self.pool.apply_async(self._excute_request_response_item())
        if ROLE == 'slave' or ROLE is None:  # the slave executes the requests (master/slave separation)
            # How do we limit the number of concurrent tasks?
            for i in range(ASNYC_MAX_COUNT):
                logger.info(u'Worker task started...')
                """*** 2. Asynchronous, non-blocking ---> execute requests ***"""
                self.pool.apply_async(self._excute_request_response_item,
                                      callback=self._callback)

        while True:
            # Soften the busy-wait: when the network is slow a single response can take
            # around 2 seconds, and without a sleep the CPU just spins. Testing shows
            # this short sleep noticeably reduces the CPU load.
            time.sleep(0.001)

            if self.total_response == self.scheduler.total_request and self.total_response != 0:
                self.is_running = False
                # total_response != 0 guards against exiting before anything has run,
                # since both counters start at 0.
                # Break once the number of responses equals the number of requests.
                break
        self.pool.close()  # stop submitting new tasks to the pool
        self.pool.join()   # make the main thread wait until all worker tasks finish

        logger.info(u"Main thread finished")
Example #2
    def add_request(self, request):
        # First check request.filter (True = deduplicate / False = do not deduplicate)
        """Incremental crawling"""
        if not request.filter:  # False: no deduplication, so no fingerprint is generated or checked
            logger.info(u"Added request (dont filter): [{}] <{}>".format(request.method, request.url))
            self.queue.put(request)

            # Every request put into the queue increments the counter
            self.total_request += 1
            return  # stop here without adding a fingerprint

        # *** 2. Generate the fingerprint (guarantees uniqueness)
        fp = self._gen_fingerprint(request)

        # *** 1. Deduplicate the request and only add non-duplicate requests to the queue
        if not self._filter_request(fp, request):
            logger.info(u"Added request (not filtered): [{}] <{}>".format(request.method, request.url))
            self.queue.put(request)

            # Every request put into the queue increments the counter
            self.total_request += 1

            # Record the request in the dedup container
            # self._filter_set.add(request.url)
            self._filter_set.add_fp(fp)  # deduplicate by fingerprint
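
Deduplication hinges on _gen_fingerprint(), which must map two requests for the same resource to the same value. A minimal sketch of such a method, assuming the Request object exposes method, url, params and data attributes (these attribute names are assumptions, not taken from the examples above):

    def _gen_fingerprint(self, request):
        # Hypothetical sketch: hash the parts of the request that define its identity,
        # sorting dict items so that parameter order does not change the fingerprint.
        import hashlib  # would normally sit at the top of the scheduler module
        sha1 = hashlib.sha1()
        sha1.update(request.method.upper().encode("utf-8"))
        sha1.update(request.url.encode("utf-8"))
        sha1.update(str(sorted((request.params or {}).items())).encode("utf-8"))
        sha1.update(str(sorted((request.data or {}).items())).encode("utf-8"))
        return sha1.hexdigest()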
Example #3
    def _filter_request(self, fp, request):
        # Check whether the request is a duplicate: return True if it is, otherwise False
        # if request.url in self._filter_set:
        if fp in self._filter_set:
            logger.info(u"Duplicate request: [{}] <{}>".format(request.method, request.url))
            return True
        else:
            return False
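
The examples above only require two operations from self._filter_set: a membership test (fp in self._filter_set) and add_fp(). A minimal in-memory version could look like the sketch below (the class name is an assumption; a distributed setup would back this with Redis rather than a Python set):

class SetFilterContainer(object):
    # Hypothetical in-memory dedup container matching the interface used above
    def __init__(self):
        self._fingerprints = set()

    def __contains__(self, fp):
        # supports `fp in container`
        return fp in self._fingerprints

    def add_fp(self, fp):
        self._fingerprints.add(fp)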
Example #4
    def start(self):
        # Log the start time so the program's running time can be measured
        start = datetime.now()
        logger.info("start time: {}".format(start))

        # Start the engine
        self._start_engine()

        stop = datetime.now()
        logger.info("stop time: {}".format(stop))

        # Record how long the program ran
        # total_seconds() returns the total difference between the two times, in seconds
        logger.info("total time: {}s".format((stop - start).total_seconds()))
Example #5
from scrapy_option.utils.log import logger
from datetime import datetime

# 5. Import the default_settings file
from scrapy_option.conf.default_settings import *

# # 6. Import the thread Pool and create the thread pool; its usage is the same as a process pool
# from multiprocessing.dummy import Pool
#
# # 7. Import the coroutine Pool; the rewritten coroutine Pool keeps the thread-pool
# #    interface, so the existing code can be reused without any changes
# from scrapy_option.async.coroutine import Pool

# 8. Optional multi-tasking backend: threads or coroutines
if ASYNC_TYPE == "coroutine":
    from scrapy_option.async.coroutine import Pool
    logger.info(u"Enabling coroutine-based asynchronous mode")
elif ASYNC_TYPE == "thread":
    from multiprocessing.dummy import Pool
    logger.info(u"Enabling thread-based asynchronous mode")
else:
    raise Exception(u"Unsupported async type")


class Engine(object):
    def __init__(self):
        # Create and initialize the engine's components
        # Receive the actual project's spiders
        self.spiders = self._auto_import_module_cls(SPIDERS, True)

        self.scheduler = Scheduler()
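
Comment 7 above says the coroutine Pool is rewritten so that it exposes the same interface as the thread pool from multiprocessing.dummy. A minimal sketch of such a wrapper, assuming gevent is the coroutine library (the file path and the use of gevent are assumptions, not shown in these examples):

# scrapy_option/async/coroutine.py -- hypothetical sketch based on gevent
from gevent import monkey
monkey.patch_all()  # make blocking I/O cooperative so coroutines actually overlap

from gevent.pool import Pool as GeventPool


class Pool(GeventPool):
    # apply_async() and join() are inherited from gevent's Pool with compatible signatures

    def close(self):
        # gevent's Pool has no close(); a no-op keeps the engine code unchanged
        pass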
Example #6
    def process_item(self, item):
        # Process the item object; receives the data object as a parameter
        # print("item:", item)
        # Write the test data to the log
        logger.info("item data: {}".format(item))