Пример #1
0
class Engine:
    """Drive one spider: pull requests/items from spider generators,
    schedule and download requests, and feed items through the pipelines.

    The engine keeps two bookkeeping dicts (``task_dict`` for download tasks,
    ``pip_task_dict`` for pipeline tasks); ``_check_can_stop`` treats the
    engine as finished only when both are empty and no queued work remains.
    """

    def __init__(self, spider, middlewire=None, pipline: "Piplines" = None):
        """Wire up scheduler and downloader from spider/global settings.

        Args:
            spider: spider instance; must expose ``cutome_setting_dict``.
            middlewire: middleware object handed to the downloader as-is.
            pipline: pipeline collection (``.piplines`` is read as a sequence
                whose entries carry the callable at index 1); may be ``None``.
        """
        # asyncio lock used as the pause flag; created lazily by _lock().
        self.lock = None
        self.reminder = reminder
        # key (uuid string) -> in-flight download task
        self.task_dict: Dict[str, asyncio.Task] = {}
        # key (uuid string) -> in-flight pipeline task/future
        self.pip_task_dict: Dict[str, asyncio.Task] = {}
        self.spider = spider
        self.middlewire = middlewire
        self.piplines = pipline
        duplicate_filter_class = self._get_dynamic_class_setting(
            "duplicate_filter_class")
        scheduler_container_class = self._get_dynamic_class_setting(
            "scheduler_container_class")
        net_download_class = self._get_dynamic_class_setting(
            "net_download_class")
        self.scheduler = Scheduler(duplicate_filter_class(),
                                   scheduler_container_class())
        req_per_concurrent = self.spider.cutome_setting_dict.get(
            "req_per_concurrent") or gloable_setting_dict.get(
                "req_per_concurrent")
        self.downloader = Downloader(self.scheduler,
                                     self.middlewire,
                                     seq=req_per_concurrent,
                                     reminder=self.reminder,
                                     downer=net_download_class())
        # entries are (spider, generator) pairs produced by spider callbacks
        self.request_generator_queue = deque()
        self.stop = False
        # Spider-level setting wins; ``None`` means "not set", so an explicit
        # False on the spider is respected.
        single = self.spider.cutome_setting_dict.get("is_single")
        self.is_single = gloable_setting_dict.get(
            "is_single") if single is None else single

        pipline_is_paralleled = self.spider.cutome_setting_dict.get(
            "pipline_is_paralleled")
        pipline_is_paralleled = gloable_setting_dict.get(
            "pipline_is_paralleled"
        ) if pipline_is_paralleled is None else pipline_is_paralleled
        self.pipline_is_paralleled = pipline_is_paralleled
        self.log = log

    def _get_dynamic_class_setting(self, key):
        """Resolve the dotted class path stored under ``key`` to a class.

        The spider's custom settings take precedence over the global ones.

        Args:
            key: setting name whose value is ``"package.module.ClassName"``.

        Returns:
            The class object named by the setting.
        """
        class_str = self.spider.cutome_setting_dict.get(
            key) or gloable_setting_dict.get(key)
        module_path, _, class_name = class_str.rpartition(".")
        _module = importlib.import_module(module_path)
        return getattr(_module, class_name)

    def iter_request(self):
        """Yield the next object produced by the queued spider generators.

        Yields ``None`` whenever the generator queue is empty. Exhausted or
        failing generators are removed from the queue; a failure is reported
        through ``_handle_exception``.
        """
        while True:
            if not self.request_generator_queue:
                yield None
                continue
            spider, real_request_generator = self.request_generator_queue[0]
            try:
                # run user code until it produces the next request or item
                request_or_item = next(real_request_generator)
            except StopIteration:
                self.request_generator_queue.popleft()
                continue
            except Exception as e:
                # drop the broken generator and let the spider see the error
                self.request_generator_queue.popleft()
                self._handle_exception(spider, e)
                continue
            yield request_or_item

    def _check_complete_pip(self, task):
        """Done-callback for pipeline tasks.

        Logs the outcome, chains the next pipeline stage when the finished
        stage returned an ``Item``, and removes the task from
        ``pip_task_dict`` (also when the task was cancelled, otherwise the
        leftover entry would keep ``_check_can_stop`` from ever succeeding).
        """
        key = getattr(task, "_key", None)
        if task.cancelled():
            self.log.debug(f" a task canceld ")
            self.pip_task_dict.pop(key, None)
            return
        if task and task.done() and key:
            if task.exception():
                self.log.error(
                    f"a task  occurer error in pipline {task.exception()}  ")
            else:
                self.log.debug(f"a task done  ")
                result = task.result()
                if result and isinstance(result, Item):
                    if hasattr(task, '_index'):
                        # register the next stage before dropping this one so
                        # the bookkeeping dict never looks empty in between
                        self._hand_piplines(task._spider, result,
                                            task._index + 1)
            self.pip_task_dict.pop(key, None)

    def _check_complete_callback(self, task):
        """Done-callback for download tasks: drop the task from ``task_dict``.

        Cancelled tasks are removed as well so they cannot block shutdown.
        """
        key = getattr(task, "_key", None)
        if task.cancelled():
            self.log.debug(f" a task canceld ")
            self.task_dict.pop(key, None)
            return
        if task and task.done() and key:
            self.log.debug(f"a task done  ")
            self.task_dict.pop(key, None)

    async def start(self):
        """Run the main engine loop until there is no work left or ``stop``
        is set, then emit the close notifications.
        """
        self.spider.on_start()
        self.request_generator_queue.append((self.spider, iter(self.spider)))

        self.reminder.go(Reminder.spider_start, self.spider)
        self.reminder.go(Reminder.engin_start, self)
        while not self.stop:
            # paused: the lock is held while pause() is in effect
            if self.lock and self.lock.locked():
                await asyncio.sleep(1)
                continue
            if not self.is_single:
                # back off when many downloads are already in flight
                if len(self.task_dict) > 2000:
                    await asyncio.sleep(0.5)

            request_or_item = next(self.iter_request())
            if isinstance(request_or_item, Request):
                self.scheduler.schedlue(request_or_item)

            if isinstance(request_or_item, Item):
                self._hand_piplines(self.spider, request_or_item)

            request = self.scheduler.get()

            if isinstance(request, Request):
                request.__spider__ = self.spider
                self._ensure_future(request)

            can_stop = self._check_can_stop(request)
            if can_stop:
                # there is no request and the task has been completed.so ended
                self.log.debug(
                    f" here is no request and the task has been completed.so  engine will stop .."
                )
                self.stop = True
                break

            resp = self.downloader.get()

            if resp is None:
                # yield to the loop so the downloader can make progress
                await asyncio.sleep(0.001)
                continue

            custome_callback = resp.request.callback
            if custome_callback:
                request_generator = custome_callback(resp)
                if request_generator:
                    self.request_generator_queue.append(
                        (custome_callback.__self__, request_generator))
            if self.spider.state != "runing":
                self.spider.state = "runing"

        self.spider.state = "closed"
        self.reminder.go(Reminder.spider_close, self.spider)
        self.spider.on_close()
        # NOTE(review): engin_start is sent with the engine but engin_close
        # with the spider — confirm which argument receivers expect.
        self.reminder.go(Reminder.engin_close, self.spider)
        # give pending resources a moment to be released
        await asyncio.sleep(0.3)
        self.log.debug(f" engine stoped..")

    def pause(self):
        """Pause the main loop by acquiring the pause lock in a new task.

        Must be called while an event loop is running.
        """
        self.log.info(f" out called pause.. so engine will pause.. ")
        asyncio.create_task(self._lock())
        self.spider.state = "pause"

    def recover(self):
        """Resume after ``pause()`` by releasing the pause lock, if held."""
        if self.lock and self.lock.locked():
            self.log.info(f" out called recover.. so engine will recover.. ")
            self.lock.release()

    def close(self):
        """Stop the engine from outside and cancel every task on the loop.

        Must be called while an event loop is running.
        """
        self.stop = True
        tasks = asyncio.all_tasks()
        for it in tasks:
            it.cancel()
        # NOTE(review): not awaited (this method is synchronous); the gather
        # future only collects the cancellation results.
        asyncio.gather(*tasks, return_exceptions=True)
        self.log.debug(f" out called stop.. so engine close.. ")

    async def _lock(self):
        """Create the pause lock on first use and acquire it."""
        if self.lock is None:
            self.lock = Lock()
        await self.lock.acquire()

    def _ensure_future(self, request: "Request"):
        """Start downloading ``request`` as a task tracked in ``task_dict``."""
        task = asyncio.ensure_future(self.downloader.download(request))
        key = str(uuid.uuid4())
        task._key = key
        self.task_dict[key] = task
        task.add_done_callback(self._check_complete_callback)

    def _handle_exception(self, spider, e):
        """Log ``e`` and forward it to ``spider.on_exception_occured``.

        Best effort: a failure inside the spider hook is logged instead of
        propagating, so one bad hook cannot take the engine loop down.
        """
        if not spider:
            return
        try:
            self.log.error(f"  occured exceptyion e {e} ", exc_info=True)
            spider.on_exception_occured(e)
        except Exception as hook_error:
            self.log.error(
                f"spider exception hook failed: {hook_error}")

    def _check_can_stop(self, request):
        """Return True when no work is left anywhere in the engine.

        Args:
            request: the request just taken from the scheduler, if any.

        For non-single mode the scheduler container is polled for up to 10
        seconds before giving up.
        """
        if request:
            return False
        if self.task_dict or self.pip_task_dict:
            return False
        # Pending spider generators are work on their own, regardless of
        # whether the scheduler currently holds anything.
        if self.request_generator_queue:
            return False
        if self.downloader.response_queue.qsize() > 0:
            return False
        if self.scheduler.scheduler_container.size() > 0:
            return False
        # NOTE(review): this wait uses a blocking sleep inside a synchronous
        # method, so the event loop is stalled while it polls.
        start = time.monotonic()
        while not self.is_single:
            if (time.monotonic() - start) > 10.0:
                print("空转 超过10s 停止")
                break
            if self.scheduler.scheduler_container.size() <= 0:
                time.sleep(0.05)
            else:
                return False
        return True

    def _hand_piplines(self, spider_ins, item, index=0):
        """Run pipeline stage ``index`` on ``item`` as a tracked task.

        Plain callables run in the loop's default executor; coroutine
        functions are scheduled on the loop. The follow-up stage is chained
        from ``_check_complete_pip``.

        Args:
            spider_ins: spider passed to the pipeline callable.
            item: item to process.
            index: position of the pipeline stage to run.
        """
        if self.piplines is None or len(self.piplines.piplines) <= 0:
            self.log.info(
                "get a item but can not  find a piplinse to handle it so ignore it "
            )
            return

        # no stage left at this position
        if len(self.piplines.piplines) < index + 1:
            return

        pip = self.piplines.piplines[index][1]

        if not callable(pip):
            return

        if not inspect.iscoroutinefunction(pip):
            task = asyncio.get_running_loop().run_in_executor(
                None, pip, spider_ins, item)
        else:
            task = asyncio.ensure_future(pip(spider_ins, item))
        key = str(uuid.uuid4())
        task._key = key
        task._index = index
        task._spider = spider_ins
        self.pip_task_dict[key] = task
        task.add_done_callback(self._check_complete_pip)
Пример #2
0
class Engine:
    """Drive one spider with a pool of worker coroutines.

    ``start`` feeds spider output into the scheduler and the item queue,
    ``work`` coroutines download scheduled requests and invoke callbacks, and
    ``handle_item`` coroutines push items through the pipelines. The engine
    stops once ``_check_can_stop`` finds no remaining work.
    """

    def __init__(self, spider, middlewire=None, pipline: "Piplines" = None):
        """Wire up scheduler and downloader from spider/global settings.

        Args:
            spider: spider instance; must expose ``cutome_setting_dict``.
            middlewire: middleware object handed to the downloader as-is.
            pipline: pipeline collection (``.piplines`` is iterated as
                ``(order, callable)`` pairs); may be ``None``.
        """
        self.reminder = reminder
        # must be set before _get_dynamic_class_setting, which logs
        self.log = log
        # asyncio lock used as the pause flag; created lazily by _lock().
        self.lock = None
        # key (uuid string) -> download task, or the request itself when the
        # download is awaited inline by _ensure_future_special
        self.task_dict: Dict[str, Any] = {}
        # key (uuid string) -> in-flight pipeline task/future
        self.pip_task_dict: Dict[str, asyncio.Task] = {}
        self.spider = spider
        self.middlewire = middlewire
        self.piplines = pipline
        duplicate_filter_class = self._get_dynamic_class_setting(
            "duplicate_filter_class")
        scheduler_container_class = self._get_dynamic_class_setting(
            "scheduler_container_class")
        net_download_class = self._get_dynamic_class_setting(
            "net_download_class")
        scheduler_class = self._get_dynamic_class_setting("scheduler_class")
        self.scheduler = scheduler_class(duplicate_filter_class(),
                                         scheduler_container_class())
        req_per_concurrent = self.spider.cutome_setting_dict.get(
            "req_per_concurrent") or gloable_setting_dict.get(
                "req_per_concurrent")
        # Spider-level setting wins; ``None`` means "not set".
        single = self.spider.cutome_setting_dict.get("is_single")
        self.is_single = gloable_setting_dict.get(
            "is_single") if single is None else single
        self.downloader = Downloader(self.scheduler,
                                     self.middlewire,
                                     reminder=self.reminder,
                                     seq=req_per_concurrent,
                                     downer=net_download_class())
        # entries are (spider, generator) pairs produced by spider callbacks
        self.request_generator_queue = deque()
        self.stop = False
        self.condition = asyncio.Condition()
        self.item_queue = asyncio.Queue()
        pipline_is_paralleled = self.spider.cutome_setting_dict.get(
            "pipline_is_paralleled")
        pipline_is_paralleled = gloable_setting_dict.get(
            "pipline_is_paralleled"
        ) if pipline_is_paralleled is None else pipline_is_paralleled
        self.pipline_is_paralleled = pipline_is_paralleled

        self.lock1 = asyncio.Lock()
        self.lock2 = asyncio.Lock()

    def _get_dynamic_class_setting(self, key):
        """Resolve the dotted class path stored under ``key`` to a class.

        The spider's custom settings take precedence over the global ones.

        Args:
            key: setting name whose value is ``"package.module.ClassName"``.

        Returns:
            The class object named by the setting.
        """
        class_str = self.spider.cutome_setting_dict.get(
            key) or gloable_setting_dict.get(key)
        module_path, _, class_name = class_str.rpartition(".")
        _module = importlib.import_module(module_path)
        _class = getattr(_module, class_name)
        self.log.info(
            f"dynamic loaded  key【{key}】--> class【{class_str}】success")
        return _class

    def iter_request(self):
        """Yield the next object produced by the queued spider generators.

        Yields ``None`` whenever the generator queue is empty. Exhausted or
        failing generators are removed from the queue; a failure is reported
        through ``_handle_exception`` and the ``spider_execption`` reminder.
        """
        while True:
            if not self.request_generator_queue:
                yield None
                continue
            spider, real_request_generator = self.request_generator_queue[0]
            try:
                request_or_item = next(real_request_generator)
            except StopIteration:
                self.request_generator_queue.popleft()
                continue
            except Exception as e:
                # drop the broken generator and let the spider see the error
                self.request_generator_queue.popleft()
                self._handle_exception(spider, e)
                self.reminder.go(Reminder.spider_execption,
                                 spider,
                                 exception=e)
                continue
            yield request_or_item

    def _check_complete_pip(self, task):
        """Done-callback for pipeline tasks.

        Logs the outcome, chains the next sequential pipeline stage when the
        finished stage returned an ``Item`` (only tasks carrying ``_index``,
        i.e. non-parallel runs), and removes the task from ``pip_task_dict``
        (also when cancelled, so a cancelled task cannot block shutdown).
        """
        key = getattr(task, "_key", None)
        if task.cancelled():
            self.log.debug(f" a task canceld ")
            self.pip_task_dict.pop(key, None)
            return
        if task and task.done() and key:
            if task.exception():
                self.log.error(
                    f"a task  occurer error in pipline {task.exception()}  ")
            else:
                self.log.debug(f"a task done  ")
                result = task.result()
                if result and isinstance(result, Item):
                    if hasattr(task, '_index'):
                        # register the next stage before dropping this one so
                        # the bookkeeping dict never looks empty in between
                        self._hand_piplines(task._spider,
                                            result,
                                            index=task._index + 1,
                                            paralleled=False)
            self.pip_task_dict.pop(key, None)

    def _check_complete_callback(self, task):
        """Done-callback for download tasks: drop the task from ``task_dict``.

        Cancelled tasks are removed as well so they cannot block shutdown.
        """
        key = getattr(task, "_key", None)
        if task.cancelled():
            self.log.debug(f" a task canceld ")
            self.task_dict.pop(key, None)
            return
        if task and task.done() and key:
            self.log.debug(f"a task done  ")
            self.task_dict.pop(key, None)

    async def start(self):
        """Run the feeder loop until no work is left or ``stop`` is set.

        Spawns 50 ``handle_item`` and 50 ``work`` coroutines, which are
        cancelled once the loop ends.
        """
        self.spider.on_start()
        self.reminder.go(Reminder.spider_start, self.spider)
        self.reminder.go(Reminder.engin_start, self)

        self.request_generator_queue.append((self.spider, iter(self.spider)))
        handle_items = [
            asyncio.ensure_future(self.handle_item()) for _ in range(50)
        ]
        works = [asyncio.ensure_future(self.work()) for _ in range(50)]
        while not self.stop:
            # paused: the lock is held while pause() is in effect
            if self.lock and self.lock.locked():
                await asyncio.sleep(0.5)
                continue
            # Original comment: "for a distributed crawler, keep in-memory
            # tasks from piling up so other machines get a fair share".
            # NOTE(review): the condition below fires for single mode, which
            # contradicts that comment — confirm the intended polarity.
            if self.is_single:
                if len(self.task_dict) > 1500:
                    await asyncio.sleep(0.3)
            waited, wait = False, None
            user_func_res = next(self.iter_request())
            if isinstance(user_func_res, Request):
                wait = self.scheduler.schedlue(user_func_res)

            if isinstance(user_func_res, (dict, Item)):
                wait = self.item_queue.put(user_func_res)
            if wait is not None and inspect.isawaitable(wait):
                waited = True
                await wait
            if not waited:
                # yield to the loop so workers can make progress
                await asyncio.sleep(0.001)
            can_stop = await self._check_can_stop(None)
            if can_stop:
                # there is no request and the task has been completed.so ended
                self.log.debug(
                    " here is no request and the task has been completed.so  engine will stop .."
                )
                self.stop = True
                break
            if self.spider.state != "runing":
                self.spider.state = "runing"

        self.spider.state = "closed"
        self.reminder.go(Reminder.spider_close, self.spider)
        self.spider.on_close()
        for _t in works + handle_items:
            _t.cancel()
        self.reminder.go(Reminder.engin_close, self)
        self.log.debug(f" engine stoped..")

    async def work(self):
        """Worker loop: take requests from the scheduler, download them and
        queue the generators returned by their callbacks.

        When the scheduler returns an awaitable, the download is awaited
        inline; otherwise it is started as a background task and responses
        are picked up from ``self.downloader.get()``.
        """
        seed = False
        while not self.stop:
            waited = False
            request = self.scheduler.get()
            if inspect.isawaitable(request):
                waited = True
                request = await request
            resp = None
            if isinstance(request, Request):
                setattr(request, "__spider__", self.spider)
                self.reminder.go(Reminder.request_scheduled, request)
                if waited:
                    resp = await self._ensure_future_special(request)
                else:
                    self._ensure_future(request)
            _resp = self.downloader.get()
            if not resp:
                resp = _resp

            if resp is None:
                if not waited:
                    await asyncio.sleep(0.004)
                # every other idle pass sleeps longer so the downloader can
                # be scheduled
                if not seed:
                    await asyncio.sleep(0.03)
                seed = not seed
                continue
            custome_callback = resp.request.callback
            if custome_callback:
                request_generator = custome_callback(resp)
                if request_generator:
                    self.request_generator_queue.append(
                        (custome_callback.__self__, request_generator))

    @staticmethod
    async def cancel_all_tasks():
        """Cancel every task on the running loop except the caller and wait
        for the cancellations to finish.

        Uses ``asyncio.all_tasks`` / ``asyncio.current_task``; the
        ``asyncio.Task`` class-method equivalents no longer exist on
        Python 3.9+.
        """
        current = asyncio.current_task()
        tasks = [task for task in asyncio.all_tasks() if task is not current]
        for task in tasks:
            task.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)

    def pause(self):
        """Pause the feeder loop by acquiring the pause lock in a new task.

        Must be called while an event loop is running.
        """
        asyncio.create_task(self._lock())
        self.spider.state = "pause"

    def recover(self):
        """Resume after ``pause()`` by releasing the pause lock, if held."""
        if self.lock and self.lock.locked():
            self.lock.release()

    def close(self):
        """Stop the engine from outside and cancel every task on the loop.

        Must be called while an event loop is running.
        """
        self.stop = True
        tasks = asyncio.all_tasks()
        for it in tasks:
            it.cancel()
        # NOTE(review): not awaited (this method is synchronous); the gather
        # future only collects the cancellation results.
        asyncio.gather(*tasks, return_exceptions=True)
        self.log.debug(f" out called stop.. so engine close.. ")

    async def _lock(self):
        """Create the pause lock on first use and acquire it."""
        if self.lock is None:
            self.lock = Lock()
        await self.lock.acquire()

    def _ensure_future(self, request: "Request"):
        """Start downloading ``request`` as a task tracked in ``task_dict``."""
        task = asyncio.ensure_future(self.downloader.download(request))
        key = str(uuid.uuid4())
        task._key = key
        self.task_dict[key] = task
        task.add_done_callback(self._check_complete_callback)

    async def _ensure_future_special(self, request: "Request"):
        """Download ``request`` inline while tracking it in ``task_dict``.

        Returns:
            Whatever ``self.downloader.download(request)`` resolves to.

        The bookkeeping entry is removed even when the download raises or is
        cancelled; otherwise the engine could never be considered idle.
        """
        key = str(uuid.uuid4())
        self.task_dict[key] = request
        try:
            return await self.downloader.download(request)
        finally:
            self.task_dict.pop(key, None)

    def _handle_exception(self, spider, e):
        """Log ``e`` and forward it to ``spider.on_exception_occured``.

        Best effort: a failure inside the spider hook is logged instead of
        propagating, so one bad hook cannot take the engine loop down.
        """
        if not spider:
            return
        try:
            self.log.error(f"  occured exceptyion e {e} ", exc_info=True)
            spider.on_exception_occured(e)
        except Exception as hook_error:
            self.log.error(
                f"spider exception hook failed: {hook_error}")

    def _has_local_work(self):
        """Return True while any in-process task, generator, item or
        response is still pending."""
        if self.task_dict or self.pip_task_dict:
            return True
        if self.request_generator_queue:
            return True
        if self.item_queue.qsize() > 0:
            return True
        return self.downloader.response_queue.qsize() > 0

    async def _scheduler_size(self):
        """Return the scheduler container size, awaiting it when the
        container's ``size()`` is asynchronous."""
        size = self.scheduler.scheduler_container.size()
        if inspect.isawaitable(size):
            size = await size
        return size

    async def _check_can_stop(self, request):
        """Return True when no work is left anywhere in the engine.

        Args:
            request: a request the caller still holds, if any.

        In non-single mode the engine stays idle for up to 5 seconds,
        polling for new work without blocking the event loop, before it
        gives up.
        """
        if request:
            return False
        if self._has_local_work():
            return False
        if (await self._scheduler_size()) > 0:
            return False
        self.reminder.go(Reminder.engin_idle, self)
        period = 5
        start = time.monotonic()
        while not self.is_single:
            if (time.monotonic() - start) > period:
                self.log.info(f"empty loop {period}s second so stop")
                break
            # Workers keep running during this wait, so work may show up
            # locally (picked up from the scheduler) as well as remotely.
            if self._has_local_work() or (await self._scheduler_size()) > 0:
                return False
            await asyncio.sleep(0.1)
        self.log.info("craw end ,engin will stop")
        return True

    def _run_pip_async(self, pip, spider_ins, item, index, paralleled=False):
        """Run one pipeline callable on ``item`` as a tracked task.

        Plain callables run in the loop's default executor; coroutine
        functions are scheduled on the loop. ``_index`` is attached only for
        sequential runs, which is what lets ``_check_complete_pip`` chain the
        next stage.
        """
        if not inspect.iscoroutinefunction(pip):
            task = asyncio.get_running_loop().run_in_executor(
                None, pip, spider_ins, item)
        else:
            task = asyncio.ensure_future(pip(spider_ins, item))
        key = str(uuid.uuid4())
        task._key = key
        if not paralleled:
            task._index = index
        task._spider = spider_ins
        self.pip_task_dict[key] = task
        task.add_done_callback(self._check_complete_pip)

    def _hand_piplines(self, spider_ins, item, index=0, paralleled=False):
        """Dispatch ``item`` to the pipelines.

        Args:
            spider_ins: spider passed to the pipeline callables.
            item: item to process.
            index: stage to run in sequential mode.
            paralleled: when True every callable pipeline is started at once;
                otherwise only stage ``index`` is started.
        """
        if self.piplines is None or len(self.piplines.piplines) <= 0:
            self.log.debug(
                "get a item but can not  find a piplinse to handle it so ignore it "
            )
            return
        if paralleled:
            for _order, pip in self.piplines.piplines:
                if callable(pip):
                    self._run_pip_async(pip, spider_ins, item, index,
                                        paralleled)
            return
        # sequential mode: nothing to do once past the last stage
        if len(self.piplines.piplines) < index + 1:
            return
        pip = self.piplines.piplines[index][1]
        if not callable(pip):
            return
        self._run_pip_async(pip, spider_ins, item, index, paralleled)

    async def handle_item(self):
        """Consumer loop: take items off ``item_queue`` and hand them to the
        pipelines until ``stop`` is set (or the task is cancelled)."""
        while not self.stop:
            if self.item_queue.qsize() <= 0:
                await asyncio.sleep(0.2)

            item = await self.item_queue.get()

            self.item_queue.task_done()
            self._hand_piplines(self.spider,
                                item,
                                paralleled=self.pipline_is_paralleled)