Пример #1
0
    def __init__(
        self,
        middleware: typing.Union[typing.Iterable, Middleware] = None,
        loop=None,
        is_async_start: bool = False,
        cancel_tasks: bool = True,
        **spider_kwargs,
    ):
        """
        Init spider object.

        :param middleware: a list of Middleware objects or a single Middleware;
            a non-empty list is merged into one object with ``+``, while
            ``None`` or an empty list falls back to a fresh ``Middleware()``
        :param loop: asyncio event loop; it is stored as ``self.loop`` and
            passed to ``asyncio.set_event_loop`` as given (may be ``None``)
        :param is_async_start: start spider by using async
        :param cancel_tasks: stored on the instance as ``self.cancel_tasks``
            (presumably whether pending tasks get cancelled on shutdown;
            confirm against the code that reads it)
        :param spider_kwargs: extra keyword arguments, kept unchanged as
            ``self.spider_kwargs``
        :raises ValueError: if ``self.start_urls`` is falsy or not an iterable
        """
        if not self.start_urls or not isinstance(
            self.start_urls, collectionsAbc.Iterable
        ):
            raise ValueError(
                "Ruia spider must have a param named start_urls, eg: start_urls = ['https://www.github.com']"
            )

        self.loop = loop
        asyncio.set_event_loop(self.loop)

        # Init object-level properties: any falsy value found on the instance
        # (or inherited through attribute lookup) is replaced by a new dict.
        self.callback_result_map = self.callback_result_map or {}

        self.request_config = self.request_config or {}
        self.headers = self.headers or {}
        self.metadata = self.metadata or {}
        self.aiohttp_kwargs = self.aiohttp_kwargs or {}
        self.spider_kwargs = spider_kwargs
        # NOTE(review): the session is created eagerly here, not inside a
        # coroutine; confirm this matches the supported aiohttp usage.
        self.request_session = ClientSession()

        self.cancel_tasks = cancel_tasks
        self.is_async_start = is_async_start

        # set logger
        self.logger = get_logger(name=self.name)

        # customize middleware
        # An empty list is deliberately excluded from the reduce branch:
        # reduce() without an initial value raises TypeError on an empty
        # sequence, so it is treated like "no middleware given" instead.
        if isinstance(middleware, list) and middleware:
            self.middleware = reduce(lambda x, y: x + y, middleware)
        else:
            self.middleware = middleware or Middleware()

        # async queue as a producer
        self.request_queue = asyncio.Queue()

        # semaphore, used for concurrency control
        self.sem = asyncio.Semaphore(self.concurrency)
Пример #2
0
 def __init__(self, middleware=None, loop=None, is_async_start=False):
     """
     Init spider object.

     :param middleware: a list of Middleware objects or a single Middleware;
         a non-empty list is merged into one object with ``+``, while
         ``None`` or an empty list falls back to a fresh ``Middleware()``
     :param loop: asyncio event loop; it is stored as ``self.loop`` and
         passed to ``asyncio.set_event_loop`` as given (may be ``None``)
     :param is_async_start: stored as ``self.is_async_start``
     """
     self.is_async_start = is_async_start
     self.logger = get_logger(name=self.name)
     self.loop = loop
     asyncio.set_event_loop(self.loop)
     # customize middleware
     # An empty list must not reach reduce(): without an initial value it
     # raises TypeError on an empty sequence, so fall back to the default.
     if isinstance(middleware, list) and middleware:
         self.middleware = reduce(lambda x, y: x + y, middleware)
     else:
         self.middleware = middleware or Middleware()
     # async queue
     self.request_queue = asyncio.Queue()
     # semaphore sized by the ``concurrency`` attribute, 3 when it is absent
     self.sem = asyncio.Semaphore(getattr(self, 'concurrency', 3))
Пример #3
0
 def __init__(self, middleware=None, loop=None):
     """
     Init spider object.

     :param middleware: a list of Middleware objects or a single Middleware;
         a non-empty list is merged into one object with ``+``, while
         ``None`` or an empty list falls back to a fresh ``Middleware()``
     :param loop: asyncio event loop; when falsy a new loop is created with
         ``asyncio.new_event_loop()``. The chosen loop is set as the current
         event loop.
     :raises ValueError: if ``self.start_urls`` is falsy or not a list
     """
     if not self.start_urls or not isinstance(self.start_urls, list):
         raise ValueError(
             "Spider must have a param named start_urls, eg: start_urls = ['https://www.github.com']"
         )
     self.logger = get_logger(name=self.name)
     self.loop = loop or asyncio.new_event_loop()
     asyncio.set_event_loop(self.loop)
     # customize middleware
     # An empty list must not reach reduce(): without an initial value it
     # raises TypeError on an empty sequence, so fall back to the default.
     if isinstance(middleware, list) and middleware:
         self.middleware = reduce(lambda x, y: x + y, middleware)
     else:
         self.middleware = middleware or Middleware()
     # async queue
     self.request_queue = asyncio.Queue()
     # semaphore sized by the ``concurrency`` attribute, 3 when it is absent
     self.sem = asyncio.Semaphore(getattr(self, 'concurrency', 3))