def processing_products(self, spider):
    if self != spider or self.skus_parsed:
        return
    request = Request(self.start_urls[0], callback=self.yield_products, dont_filter=True)
    self.crawler.engine.crawl(request, spider)
    raise DontCloseSpider()

def spider_idle(self, spider):
    for request in self._read_queue():
        self.crawler.engine.crawl(request, self)
    raise DontCloseSpider()

def spider_idle(cls, spider):
    if cls.requests.get(spider):
        spider.log("delayed requests pending, not closing spider")
        raise DontCloseSpider()

def keep_spider_alive(self, spider):
    raise DontCloseSpider("keeping it open")

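None of these handlers run unless they are connected to the spider_idle signal. A minimal sketch of that wiring, assuming a hypothetical KeepAliveSpider (the name, start URL, and handler name are illustrative, not from the snippets above):

import scrapy
from scrapy import signals
from scrapy.exceptions import DontCloseSpider


class KeepAliveSpider(scrapy.Spider):
    # hypothetical spider used only to illustrate connecting the handler
    name = "keep_alive"
    start_urls = ["https://example.com"]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # the engine catches DontCloseSpider raised from spider_idle handlers
        # and keeps the spider running instead of shutting it down
        crawler.signals.connect(spider.keep_spider_alive, signal=signals.spider_idle)
        return spider

    def parse(self, response):
        pass

    def keep_spider_alive(self, spider):
        raise DontCloseSpider("keeping it open")
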
def _spider_idle(self):
    self._schedule_next_request()
    raise DontCloseSpider()

def _logout(self, spider):
    if spider != self:
        return
    if self.logout_done:
        return
    if not self.logout_url:
        return
    self.crawler.engine.schedule(self.logout(), spider)
    raise DontCloseSpider('logout scheduled')

def spider_idle(self, spider):
    if spider != self:
        return
    if self.items:
        self.crawler.engine.schedule(
            Request('http://' + self.allowed_domains[0],
                    callback=self.yield_product,
                    dont_filter=True),
            spider)
        raise DontCloseSpider('Found pending requests')

def spider_idle(self, spider):
    raise DontCloseSpider('waiting for process')

def parse_article(self, response):
    comment_item = CommentItem()
    post_item = PostItem()
    # resolve the URL of the next listing page
    try:
        next_page = response.xpath(
            "//div[@id='action-bar-container']//a[contains(text(), '上頁')]/@href"
        )[0]
    except IndexError:
        next_page = None
        logging.error('Cannot load the next page')
    # count the articles on this listing page
    try:
        article_list = response.css('.r-list-container > div')
    except Exception:
        article_list = []
        logging.error('No article was loaded')
    else:
        logging.info('the number of articles is {}'.format(len(article_list)))
    # walk through every div tag in the listing
    while len(article_list) > 0:
        div = article_list.pop(0)
        try:
            # class name of the div tag
            slot_name = div.xpath('@class')[0].extract()
            # canonical URL of the article
            canonicalUrl = response.urljoin(
                div.css('.title a::attr(href)')[0].extract())
            # author of the article
            author_str = div.css('.author::text')[0].extract()
        except IndexError:
            logging.error('Fail to access url, author and classes')
        else:
            if slot_name == 'r-list-sep':
                '''
                Once we reach class='r-list-sep', crawl the next page.
                '''
                if next_page:
                    # the next page exists
                    logging.warning('redirect to following {}'.format(canonicalUrl))
                    yield scrapy.Request(canonicalUrl,
                                         callback=self.parse_article,
                                         dont_filter=True)
                else:
                    # there is no following page; stop the spider and wait for a new request
                    raise CloseSpider('page exceeded')
                break  # nothing left to search after the separator
            else:
                # slot_name is 'r-ent' or 'search-bar'
                '''
                Load each article URL and collect information such as
                content, comments, author, etc.
                '''
                if slot_name != 'r-ent':
                    continue  # skip 'search-bar'
                # keep only articles whose date falls inside the requested period
                date_str = div.css('.date::text')[0].extract()
                m_d = tuple(map(int, date_str.split('/')))
                m_d = datetime(self.year, m_d[0], m_d[1])
                in_period = self.start <= m_d <= self.end
                logging.debug('the date is {}/{}, in period: {}'.format(
                    m_d.month, m_d.day, in_period))
                if in_period:
                    # fill the items from the listing row before loading the article itself
                    try:
                        post_item['canonicalUrl'] = canonicalUrl
                        comment_item['canonicalUrl'] = canonicalUrl
                        post_item['authorId'] = author_str
                        post_item['title'] = div.css('.title a::text')[0].extract()
                        post_item['publishedTime'] = date_str
                        post_item['board'] = self.board
                    except IndexError:
                        logging.error('Fail to save object (postItem)')
                    # follow the URL of the article itself
                    try:
                        url = response.urljoin(
                            div.css('.title a::attr(href)')[0].extract())
                        logging.info('load url inside every article: {}'.format(url))
                        yield scrapy.Request(url,
                                             meta={'post': post_item,
                                                   'comment': comment_item},
                                             callback=self.parse_comment,
                                             dont_filter=True)
                    except IndexError:
                        logging.error('Cannot load article')
                else:
                    self.maximum_missing_count -= 1
    if len(article_list) == 0:
        '''
        Every article on this page has been explored, so go to the next page.
        '''
        if next_page and self.maximum_missing_count > 0:
            # the next page exists
            url = response.urljoin(next_page.extract())
            logging.warning('redirect to following {}'.format(url))
            yield scrapy.Request(url, callback=self.parse_article, dont_filter=True)
        else:
            # there is no following page; stop here and wait for a new request
            logging.error('Without the next page')
            # keep running endlessly
            raise DontCloseSpider('page exceeded')

def _dont_close_me(self, spider):
    raise DontCloseSpider("..I prefer live spiders.")

def spider_idle(self, spider): """当spider收到idle信号时, 去ZooKeeper上获取一个任务, 再抛出DontCloseSpider异常""" self.request_new_job(spider) raise DontCloseSpider()
def signal_dispatcher(self, signal):
    if self.signals_callback:
        if signal == signals.spider_idle or signal == signals.spider_error:
            raise DontCloseSpider('I prefer live spiders')
        elif signal == signals.spider_opened:
            self.signals_callback(signal, spider=self)

def leftover_requests(self, spider):
    if self.resume_iter:
        self.defrost_in_batch(spider)
        raise DontCloseSpider()

def spider_idle(self, spider):
    if self.rountine_interval:
        self.schedule_rountine_requests(spider)
        raise DontCloseSpider()

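Several of the handlers above follow the same pattern: pull pending work from some external source when the spider goes idle, schedule it, and raise DontCloseSpider only if something was scheduled. A hedged sketch of that pattern, where QueueFedSpider and _read_queue are placeholders rather than names from the snippets above:

import scrapy
from scrapy import signals
from scrapy.exceptions import DontCloseSpider


class QueueFedSpider(scrapy.Spider):
    # illustrative only; _read_queue stands in for a real queue or broker client
    name = "queue_fed"

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    def _read_queue(self):
        # placeholder: return an iterable of scrapy.Request objects built from queued URLs
        return []

    def parse(self, response):
        pass

    def spider_idle(self, spider):
        scheduled = False
        for request in self._read_queue():
            # the two-argument crawl() matches the older engine API used in the
            # snippets above; newer Scrapy versions accept only the request
            self.crawler.engine.crawl(request, spider)
            scheduled = True
        if scheduled:
            # keep the spider alive only while the queue still produces work
            raise DontCloseSpider("new requests scheduled from the queue")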