def parse(self, response):
    """Parse a list page and yield the follow-up requests.

    On the first list page the text selected by ``self.checkTxtXpath`` is
    hashed with ``toMd5`` and compared with ``self.last_md5``. When the two
    hashes match, nothing is yielded. Otherwise everything produced by
    ``self.getDetailPageUrls(response)`` is yielded, followed by everything
    from ``self.getNextListPageUrl(response)`` unless ``self.isDone`` is
    set, and on the first page the fresh hash is stored via ``syncLastMd5``.

    ``self.isFirstListPage`` is cleared when the generator finishes.

    NOTE(review): the source arrived collapsed onto one line; the nesting
    (md5 sync inside the "changed" path, flag reset on every path) is
    reconstructed -- confirm against the original layout.
    """
    last_md5 = ''
    if self.isFirstListPage:
        check_text = self.safeParse(response, self.checkTxtXpath)
        last_md5 = toMd5(check_text)

    # Lazy logging arguments: the message is only formatted if it is emitted.
    logging.info("*********last_md5 : %s self.last_md5 : %s*****",
                 last_md5, self.last_md5)

    if self.isFirstListPage and last_md5 == self.last_md5:
        # Page unchanged since the last run: emit nothing. This used to be
        # ``yield []``, which hands a bare list to the consumer; presumably
        # this is a Scrapy callback, where a list is neither a request nor
        # an item and is reported as an error -- confirm.
        self.isFirstListPage = False
        return

    for request in self.getDetailPageUrls(response):
        yield request

    # Follow the next list page unless crawling has been marked as done.
    if not self.isDone:
        for request in self.getNextListPageUrl(response):
            yield request

    # Sync the md5 code (and last_id) once, after the first list page.
    if self.isFirstListPage:
        syncLastMd5({'last_md5': last_md5, 'id': self.rule_id})

    self.isFirstListPage = False
def parse(self, response):
    """Parse a list page and yield the follow-up requests.

    On the first list page the text selected by ``self.checkTxtXpath`` is
    hashed with ``toMd5`` and compared with ``self.last_md5``. Nothing is
    yielded when all of the following hold: ``self.is_duplicate`` is falsy,
    ``OPEN_MD5_CHECK`` is truthy, this is the first list page, and the two
    hashes match. Otherwise everything produced by
    ``self.getDetailPageUrls(response)`` is yielded, followed by everything
    from ``self.getNextListPageUrl(response)`` unless ``self.isDone`` is
    set, and on the first page the fresh hash is stored via ``syncLastMd5``.

    ``self.isFirstListPage`` is cleared when the generator finishes.

    NOTE(review): the source arrived collapsed onto one line; the nesting
    (md5 sync inside the "changed" path, flag reset on every path) is
    reconstructed -- confirm against the original layout.
    """
    last_md5 = ''
    if self.isFirstListPage:
        check_text = self.safeParse(response, self.checkTxtXpath)
        last_md5 = toMd5(check_text)

    # Lazy logging arguments: the message is only formatted if it is emitted.
    logging.info("*********last_md5 : %s self.last_md5 : %s*****",
                 last_md5, self.last_md5)

    # Operand order is kept from the original so short-circuiting is the same.
    page_unchanged = (
        (not self.is_duplicate)
        and OPEN_MD5_CHECK
        and self.isFirstListPage
        and last_md5 == self.last_md5
    )
    if page_unchanged:
        # Emit nothing. This used to be ``yield []``, which hands a bare
        # list to the consumer; presumably this is a Scrapy callback, where
        # a list is neither a request nor an item and is reported as an
        # error -- confirm.
        self.isFirstListPage = False
        return

    for request in self.getDetailPageUrls(response):
        yield request

    # Follow the next list page unless crawling has been marked as done.
    if not self.isDone:
        for request in self.getNextListPageUrl(response):
            yield request

    # Sync the md5 code (and last_id) once, after the first list page.
    if self.isFirstListPage:
        syncLastMd5({'last_md5': last_md5, 'id': self.rule_id})

    self.isFirstListPage = False
def run(self, config):
    """Fetch the configured feed and parse it unless its entries are unchanged.

    After ``self.initConfig(config)``, the first element of
    ``config['start_urls']`` is passed to ``feedparser.parse``. The feed's
    ``entries`` are hashed with ``toMd5``; when ``OPEN_MD5_CHECK`` is truthy
    and the hash equals ``self.last_md5`` the method returns ``True`` without
    parsing. Otherwise the feed is handed to ``self.parse`` and the new hash
    is stored through ``syncLastMd5``; in that case nothing is returned.
    """
    self.initConfig(config)

    feed_url = config.get('start_urls', '')[0]
    feed = feedparser.parse(feed_url)

    # md5 check
    current_md5 = toMd5(feed.entries)
    message = "*********last_md5 : %s self.last_md5 : %s*****" % (
        current_md5, self.last_md5)
    logging.info(message)

    unchanged = OPEN_MD5_CHECK and self.last_md5 == current_md5
    if not unchanged:
        self.parse(feed)  # parse the RSS feed
        payload = {'last_md5': current_md5, 'id': self.rule_id}
        syncLastMd5(payload)
        return None

    return True