class SingleInheritor(TaskToSingleResultSpider):
    """Example spider that consumes RMQ tasks and produces one result each.

    Tasks arrive as JSON messages containing a "url" key; the spider fetches
    the page and emits its meta description as a ``MetaDescriptionItem``.
    """

    name = "single_inheritor_example"
    custom_settings = {
        "ITEM_PIPELINES": {
            get_import_full_name(ItemProducerPipeline): 310,
        }
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Queue names are derived from the spider name so that several
        # spiders can share one broker without clashing.
        self.task_queue_name = f"{self.name}_task_queue"
        self.result_queue_name = f"{self.name}_result_queue"

    def next_request(self, _delivery_tag, msg_body):
        """Turn an incoming RMQ task message into a scrapy Request."""
        task = json.loads(msg_body)
        return scrapy.Request(task["url"], callback=self.parse)

    @rmq_callback
    def parse(self, response):
        """Yield the page's ``<meta name="description">`` content (or None)."""
        meta_description = response.xpath(
            '//meta[@name="description"]/@content'
        ).get(default=None)
        yield MetaDescriptionItem({"description": meta_description})

    @rmq_errback
    def _errback(self, failure):
        """Retry a copy of the request on tunnel errors; log anything else."""
        if not failure.check(TunnelError):
            self.logger.warning(f"IN ERRBACK: {repr(failure)}")
        else:
            self.logger.info("TunnelError. Copy request")
            yield failure.request.copy()
def update_settings(cls, settings):
    """Register the RMQ spider middlewares/extensions, then overlay
    ``cls.custom_settings`` on top of them.

    ``SPIDER_MIDDLEWARES`` and ``EXTENSIONS`` entries from custom_settings
    are merged with (and may override) the defaults registered here; every
    other custom setting is applied individually.

    Fix: the previous version called ``settings.set()`` without a priority,
    so values landed at the default ('project') priority and could be
    silently overridden by project settings. Scrapy's stock
    ``Spider.update_settings`` applies custom_settings at 'spider'
    priority; this now does the same.
    """
    spider_middlewares = settings.getdict("SPIDER_MIDDLEWARES")
    spider_middlewares[get_import_full_name(TaskTossSpiderMiddleware)] = 140
    spider_middlewares[get_import_full_name(DeliveryTagSpiderMiddleware)] = 150

    spider_extensions = settings.getdict("EXTENSIONS")
    spider_extensions[get_import_full_name(RPCTaskConsumer)] = 20

    for custom_setting, value in (cls.custom_settings or {}).items():
        if custom_setting == "SPIDER_MIDDLEWARES":
            spider_middlewares = {**spider_middlewares, **value}
        elif custom_setting == "EXTENSIONS":
            spider_extensions = {**spider_extensions, **value}
        else:
            settings.set(custom_setting, value, priority="spider")

    settings.set("SPIDER_MIDDLEWARES", spider_middlewares, priority="spider")
    settings.set("EXTENSIONS", spider_extensions, priority="spider")
def update_settings(cls, settings):
    """Inject ``RmqReaderMiddleware`` into this spider's custom settings
    before delegating to the standard settings update.

    Fix: the previous version mutated ``cls.custom_settings`` and its
    nested ``SPIDER_MIDDLEWARES`` dict in place. When ``custom_settings``
    is inherited from a base class, that in-place mutation alters the base
    class's dict and leaks the middleware into sibling spiders. Work on
    shallow copies instead, then rebind on ``cls``.
    """
    custom_settings: dict = dict(cls.custom_settings or {})
    spider_middlewares: dict = dict(custom_settings.get('SPIDER_MIDDLEWARES', {}))
    spider_middlewares[
        get_import_full_name(rmq_reader_middleware.RmqReaderMiddleware)
    ] = 1
    custom_settings['SPIDER_MIDDLEWARES'] = spider_middlewares
    cls.custom_settings = custom_settings
    super().update_settings(settings)
def crawler():
    """Fixture-style generator: yield a ``CrawlerProcess`` built from the
    project settings with test-friendly overrides applied at 'spider'
    priority (single request at a time, debug logging to stderr).
    """
    settings = get_project_settings()
    settings.setdict(
        {
            "DOWNLOADER_MIDDLEWARES": {
                get_import_full_name(Response301DownloaderMiddleware): 1,
            },
            'CONCURRENT_REQUESTS': 1,
            'LOG_FILE': None,
            'LOG_LEVEL': 'DEBUG',
        },
        priority='spider',
    )
    yield CrawlerProcess(settings=settings)
class MySpider(RmqSpider):
    """Minimal RMQ-driven example spider: every queue message triggers a
    fetch of the same status endpoint, and parsing produces no items.
    """

    name = 'myspider'
    message_type: Type[BaseRmqMessage] = BaseRmqMessage
    task_queue_name: str = QUEUE_NAME
    custom_settings = {
        "DOWNLOADER_MIDDLEWARES": {
            get_import_full_name(CustomDownloaderMiddleware): 1,
        }
    }

    def next_request(self, message: BaseRmqMessage) -> Request:
        """Build the (fixed) request for an incoming queue message."""
        return Request('https://httpstat.us/200', dont_filter=True)

    def parse(self, response, **kwargs):
        """Log that parsing ran; yields nothing."""
        self.logger.info("PARSE METHOD")
        return
        # Unreachable yield keeps this method a generator, matching the
        # original ``yield from ()`` behavior.
        yield