Пример #1
0
    def parse_articles_follow_next_page(self, response):
        """Emit the raw page as an item, then follow the pager's next link.

        Yields one ``crawldata`` item holding ``response.body`` under ``data``
        and ``response.url`` under ``url``.  When the page has a
        ``ul.navigation > li.next-page > a`` link, a request for that link is
        yielded as well, with this same method as its callback.
        """
        page = crawldata()
        page['data'] = response.body
        page['url'] = response.url
        yield page

        next_links = response.css("ul.navigation > li.next-page > a::attr('href')")
        if not next_links:
            # No pager link matched: nothing more to follow from this page.
            return

        # The href may be relative, so resolve it against the current page.
        href = next_links[0].extract()
        target = response.urljoin(href)
        yield scrapy.Request(target, self.parse_articles_follow_next_page)
Пример #2
0
    def parse_articles_follow_next_page(self, response):
        """Scrape one page of a forum thread and follow its "next page" link.

        Yields a single ``crawldata`` item for the page with:

        * ``url``      -- ``response.url``
        * ``title``    -- text of ``span#thread_subject`` (``None`` if absent)
        * ``tag``      -- text of the link inside ``h1.ts`` (``None`` if absent)
        * ``postdate`` -- copied from ``response.meta['postdate']`` when that
          lookup succeeds; otherwise left unset
        * ``data``     -- UTF-8 encoded text built from every post body on the
          page: inline text chunks and image URLs, each followed by a newline

        If the pager contains an ``a.nxt`` link, a request for that page is
        yielded with this same method as the callback, carrying ``postdate``
        along in its ``meta`` so later pages of the thread keep the value.
        """
        _item = crawldata()
        _item['url'] = response.url
        _item['title'] = response.xpath("//span[@id='thread_subject']/text()").extract_first()
        _item['tag'] = response.xpath("//h1[@class='ts']/a/text()").extract_first()

        # Meta forwarded to the next-page request.  Without it the follow-up
        # pages would never see 'postdate' and would hit the except branch
        # below on every page.
        _next_meta = {}
        try:
            _postdate = response.meta['postdate']
            _item['postdate'] = _postdate
            _next_meta['postdate'] = _postdate
        except Exception:
            # Best effort: a missing postdate must not abort the page, so the
            # failure is only reported through the project's util helper.
            util.exc_info()

        _posts = response.xpath(
            "//div[@id='postlist']/div[starts-with(@id,'post_')]/table/tr"
            "/td[@class='plc']/div[@class='pct']/div[@class='pcb']/div[@class='t_fsz']"
        )
        _message = []
        for _post in _posts:
            # Walk the direct children of the post's content cell in order.
            for _node in _post.xpath("table/tr/td/child::node()"):
                _node_type = _node.xpath("name()").extract_first()
                if _node_type is None:
                    # No element name came back -- presumably a bare text
                    # node; keep its content verbatim.
                    _message.append(_node.extract())
                    _message.append("\n")
                elif _node_type == "ignore_js_op":
                    # Inline image wrapper: the image location is read from
                    # the img element's 'file' attribute.
                    _img_url = _node.xpath("div//img/@file").extract_first()
                    if _img_url is not None:
                        _message.append(response.urljoin(_img_url))
                        _message.append("\n")

            # Extract attachment images, e.g. on pages shaped like
            # http://hzbike.com/forum.php?mod=viewthread&tid=118823&page=1&authorid=22591
            for _img in _post.xpath("div[@class='pattl']/ignore_js_op"):
                _img_url = _img.xpath(".//img/@file").extract_first()
                if _img_url is None:
                    continue
                # An optional caption precedes the image URL.
                _img_desc = _img.xpath(".//p[@class='mbn xg2']/text()").extract_first()
                if _img_desc is not None:
                    _message.append(_img_desc)
                    _message.append("\n")
                _message.append(response.urljoin(_img_url))
                _message.append("\n")

        _item['data'] = "".join(_message).encode("utf8")
        yield _item

        next_page = response.xpath("//div[@class='pgt']/div[@class='pg']/a[@class='nxt']/@href")
        if next_page:
            url = response.urljoin(next_page.extract_first())
            yield scrapy.Request(url, self.parse_articles_follow_next_page, meta=_next_meta)