# NOTE: timeutil, util, crawldata, and the spider attributes url_pattern /
# last_date come from the surrounding project; a hedged stand-in sketch of
# those pieces follows these two callbacks.
def parse(self, response):
    _root = response.xpath("//tbody/tr")
    for _tr in _root:
        # Thread type, title, and post creation date of this row.
        _type = _tr.xpath("th/em/a/text()").extract_first()
        title = _tr.xpath("th/a/text()").extract_first()
        cdate = _tr.xpath("td[@class='by']/em/span/text()").extract_first()
        if cdate is None:
            self.info("cdate is None")
            # self.debug(_tr.extract())
            continue
        # Only follow threads created after the last crawl.
        if timeutil.strdt_datetime(cdate, ft='%Y-%m-%d %H:%M') > self.last_date:
            try:
                # re_first() returns None when the pattern does not match
                # (the original .re(...).pop() raised IndexError instead,
                # which made the None check below unreachable).
                uid = _tr.xpath("td[@class='by']/cite/a/@href").re_first(r"uid=(\d+)")
                tid = _tr.xpath("th/a/@href").re_first(r"tid=(\d+)")
                if uid is None or tid is None:
                    self.info("title:%s %s;uid or tid is None" % (_type, title))
                    # self.debug(_tr.extract())
                    continue
                _thread_url = self.url_pattern % {'tid': tid, 'uid': uid}
                url = response.urljoin(_thread_url)
                # self.info("wait crawl url:%s from %s" % (url, response.url))
                yield scrapy.Request(
                    url,
                    callback=self.parse_articles_follow_next_page,
                    meta={'postdate': timeutil.strdt_datetime(cdate, ft='%Y-%m-%d %H:%M')})
            except Exception:
                util.exc_info()
                self.debug(_tr.extract())
    # Follow the "next page" link of the thread list.
    nextpage = response.xpath("//a[@class='nxt']/@href").extract_first()
    if nextpage is not None:
        url = response.urljoin(nextpage)
        self.info("list url->%s" % url)
        yield scrapy.Request(url, callback=self.parse)
def parse_articles_follow_next_page(self, response):
    _item = crawldata()
    _item['url'] = response.url
    _title = response.xpath("//span[@id='thread_subject']/text()").extract_first()
    _item['title'] = _title
    _tag = response.xpath("//h1[@class='ts']/a/text()").extract_first()
    _item['tag'] = _tag
    try:
        _item['postdate'] = response.meta['postdate']
    except Exception:
        util.exc_info()
    # One selector per post body on this page (Discuz post layout).
    _root = response.xpath(
        "//div[@id='postlist']/div[starts-with(@id,'post_')]/table/tr"
        "/td[@class='plc']/div[@class='pct']/div[@class='pcb']/div[@class='t_fsz']")
    _message = []
    for _root_item in _root:
        _second_root = _root_item.xpath("table/tr/td/child::node()")
        for _second_item in _second_root:
            # name() is empty for text nodes, so _node_type is None for plain text.
            _node_type = _second_item.xpath("name()").extract_first()
            if _node_type is None:
                # append, not extend: extend() on a string added it character
                # by character (same joined result, but unidiomatic).
                _message.append(_second_item.extract())
                _message.append("\n")
            elif _node_type == "ignore_js_op":
                # Inline image wrapped in Discuz's ignore_js_op element.
                _img_url = _second_item.xpath("div//img/@file").extract_first()
                if _img_url is not None:
                    _message.append(response.urljoin(_img_url))
                    _message.append("\n")
        # Extract attached images, as on pages like
        # http://hzbike.com/forum.php?mod=viewthread&tid=118823&page=1&authorid=22591
        _img_list = _root_item.xpath("div[@class='pattl']/ignore_js_op")
        for _img in _img_list:
            _img_url = _img.xpath(".//img/@file").extract_first()
            if _img_url is not None:
                _img_desc = _img.xpath(".//p[@class='mbn xg2']/text()").extract_first()
                if _img_desc is not None:
                    _message.append(_img_desc)
                    _message.append("\n")
                _message.append(response.urljoin(_img_url))
                _message.append("\n")
    _item['data'] = "".join(_message).encode("utf8")
    yield _item
    # Follow the "next page" link inside the thread itself.
    next_page = response.xpath("//div[@class='pgt']/div[@class='pg']/a[@class='nxt']/@href")
    if next_page:
        url = response.urljoin(next_page.extract_first())
        yield scrapy.Request(url, self.parse_articles_follow_next_page)
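# --- Hypothetical stand-ins for the project pieces this excerpt assumes ---
# The callbacks above call timeutil.strdt_datetime, util.exc_info, and the
# crawldata item, none of which are defined in this excerpt. The sketch below
# is inferred purely from how they are called, so the snippet can be read (and
# run) in isolation; the real timeutil/util modules and item class may differ.
from datetime import datetime
import logging
import traceback

import scrapy


def strdt_datetime(strdt, ft='%Y-%m-%d %H:%M'):
    # Parse a forum timestamp such as '2016-05-01 12:30' into a datetime,
    # matching the ft= keyword used in the callbacks above.
    return datetime.strptime(strdt, ft)


def exc_info():
    # Log the currently handled exception together with its traceback.
    logging.error(traceback.format_exc())


class crawldata(scrapy.Item):
    # Fields inferred from the assignments in parse_articles_follow_next_page.
    url = scrapy.Field()
    title = scrapy.Field()
    tag = scrapy.Field()
    postdate = scrapy.Field()
    data = scrapy.Field()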
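# --- A plausible url_pattern, inferred from the sample URL in the comment in
# --- parse_articles_follow_next_page; the real attribute is defined on the
# --- spider class elsewhere, so treat this value (and the query-parameter
# --- order) as an assumption.
url_pattern = "forum.php?mod=viewthread&tid=%(tid)s&page=1&authorid=%(uid)s"

# Example expansion, matching the sample URL format:
#   url_pattern % {'tid': '118823', 'uid': '22591'}
#   -> 'forum.php?mod=viewthread&tid=118823&page=1&authorid=22591'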