def __init__(
    self,
    middleware: typing.Union[typing.Iterable, Middleware] = None,
    loop=None,
    is_async_start: bool = False,
    cancel_tasks: bool = True,
    **spider_kwargs,
):
    """
    Init spider object.

    :param middleware: a list of or a single Middleware; ``None`` or an
        empty list falls back to a default ``Middleware()``
    :param loop: asyncio event loop
    :param is_async_start: start spider by using async
    :param cancel_tasks: stored as ``self.cancel_tasks``; presumably controls
        whether pending tasks are cancelled on shutdown (confirm against
        the code that reads it)
    :param spider_kwargs: extra keyword arguments, stored as
        ``self.spider_kwargs``
    :raises ValueError: if ``start_urls`` is empty or is not an iterable
    """
    if not self.start_urls or not isinstance(
        self.start_urls, collectionsAbc.Iterable
    ):
        raise ValueError(
            "Ruia spider must have a param named start_urls, eg: start_urls = ['https://www.github.com']"
        )

    # NOTE(review): when ``loop`` is None this passes None to
    # ``asyncio.set_event_loop``; callers are presumably expected to
    # supply a loop - confirm against the start helpers.
    self.loop = loop
    asyncio.set_event_loop(self.loop)

    # Init object-level properties: give the instance its own value
    # whenever the inherited one is falsy (e.g. None).
    self.callback_result_map = self.callback_result_map or {}
    self.request_config = self.request_config or {}
    self.headers = self.headers or {}
    self.metadata = self.metadata or {}
    self.aiohttp_kwargs = self.aiohttp_kwargs or {}
    self.spider_kwargs = spider_kwargs
    self.request_session = ClientSession()

    self.cancel_tasks = cancel_tasks
    self.is_async_start = is_async_start

    # set logger
    self.logger = get_logger(name=self.name)

    # customize middleware
    if isinstance(middleware, list) and middleware:
        # Fold the list into a single object using ``+``.
        self.middleware = reduce(lambda x, y: x + y, middleware)
    else:
        # None, an empty list (reduce would raise TypeError on it),
        # or a single Middleware instance.
        self.middleware = middleware or Middleware()

    # async queue as a producer
    self.request_queue = asyncio.Queue()

    # semaphore, used for concurrency control
    self.sem = asyncio.Semaphore(self.concurrency)
def __init__(self, middleware=None, loop=None, is_async_start=False):
    """
    Init spider object.

    :param middleware: a list of or a single Middleware; ``None`` or an
        empty list falls back to a default ``Middleware()``
    :param loop: asyncio event loop, installed via ``asyncio.set_event_loop``
    :param is_async_start: stored as ``self.is_async_start``
    """
    self.is_async_start = is_async_start
    self.logger = get_logger(name=self.name)

    # NOTE(review): ``loop`` is passed through unchanged, so None reaches
    # ``asyncio.set_event_loop`` when no loop is given - confirm callers
    # always supply one.
    self.loop = loop
    asyncio.set_event_loop(self.loop)

    # customize middleware
    if isinstance(middleware, list) and middleware:
        # Fold the list into a single object using ``+``.
        self.middleware = reduce(lambda x, y: x + y, middleware)
    else:
        # None, an empty list (reduce would raise TypeError on it),
        # or a single Middleware instance.
        self.middleware = middleware or Middleware()

    # async queue
    self.request_queue = asyncio.Queue()

    # semaphore; sized by ``self.concurrency`` when defined, otherwise 3
    self.sem = asyncio.Semaphore(getattr(self, 'concurrency', 3))
def __init__(self, middleware=None, loop=None):
    """
    Init spider object.

    :param middleware: a list of or a single Middleware; ``None`` or an
        empty list falls back to a default ``Middleware()``
    :param loop: asyncio event loop; a new one is created when omitted
    :raises ValueError: if ``start_urls`` is empty or is not a list
    """
    if not self.start_urls or not isinstance(self.start_urls, list):
        raise ValueError(
            "Spider must have a param named start_urls, eg: start_urls = ['https://www.github.com']"
        )

    self.logger = get_logger(name=self.name)

    # Use the supplied loop or create a fresh one, then install it as
    # the current event loop.
    self.loop = loop or asyncio.new_event_loop()
    asyncio.set_event_loop(self.loop)

    # customize middleware
    if isinstance(middleware, list) and middleware:
        # Fold the list into a single object using ``+``.
        self.middleware = reduce(lambda x, y: x + y, middleware)
    else:
        # None, an empty list (reduce would raise TypeError on it),
        # or a single Middleware instance.
        self.middleware = middleware or Middleware()

    # async queue
    self.request_queue = asyncio.Queue()

    # semaphore; sized by ``self.concurrency`` when defined, otherwise 3
    self.sem = asyncio.Semaphore(getattr(self, 'concurrency', 3))