Пример #1
0
    def parse(self, response):
        """Emit the article on this page, then follow its "next article" link.

        Yields one ``CsdnblogItem`` holding the page URL and the title text
        nodes (both UTF-8 encoded), followed by a ``Request`` for the next
        article that is routed back to this method. When the page has no
        ``next_article`` link, nothing more is yielded and the chain ends.

        Args:
            response: The downloaded page; ``response.url`` is read and the
                response is handed to ``Selector`` for XPath queries.
        """
        sel = Selector(response)

        # Get the article URL and title.
        item = CsdnblogItem()

        article_url = str(response.url)
        article_name = sel.xpath(
            '//div[@id="article_details"]/div/h1/span/a/text()').extract()

        item['article_name'] = [n.encode('utf-8') for n in article_name]
        item['article_url'] = article_url.encode('utf-8')

        yield item

        # Get the URL of the next article.
        urls = sel.xpath('//li[@class="next_article"]/a/@href').extract()
        if not urls:
            # No "next article" link on this page (presumably the last
            # article in the chain): stop here instead of failing with an
            # IndexError on urls[0].
            return

        # Debug trace of the extracted href. The single-argument,
        # parenthesized form prints the same text under Python 2's print
        # statement and also parses under Python 3.
        print('-------------------------------xxy-- %s' % urls[0])
        # The href is treated as site-relative, so the blog host is prefixed.
        url = "http://blog.csdn.net" + urls[0]
        print(url)
        yield Request(url, callback=self.parse)
Пример #2
0
    def parse(self, response):
        """Scrape one article page and queue every "next article" link.

        Yields a ``CsdnblogItem`` whose ``article_name`` is the list of
        UTF-8 encoded title text nodes and whose ``article_url`` is the
        UTF-8 encoded page URL. Afterwards yields one ``Request`` per
        ``next_article`` href, each prefixed with the blog host and routed
        back to this method.

        Args:
            response: The downloaded page; ``response.url`` is read and the
                response is handed to ``Selector`` for XPath queries.
        """
        page = Selector(response)
        page_url = str(response.url)
        title_parts = page.xpath(
            '//div[@class="article_title"]/h1/span/a/text()').extract()

        record = CsdnblogItem()
        record['article_name'] = [part.encode('utf-8') for part in title_parts]
        record['article_url'] = page_url.encode('utf-8')
        yield record

        # Each href is treated as site-relative, so the host is prefixed
        # before the request is built.
        host = "http://blog.csdn.net"
        for href in page.xpath('//li[@class="next_article"]/a/@href').extract():
            yield Request(host + href, callback=self.parse)
Пример #3
0
    def parse(self, response):
        """Emit the article on this page, then follow the next-article link.

        Yields one ``CsdnblogItem`` with the first title text node (as
        returned by ``extract_first()``) and the page URL. If a
        next-article href is found, also yields a ``Request`` for it that
        is routed back to this method.

        Args:
            response: The downloaded page; must provide ``url``, ``xpath``
                and ``urljoin``.
        """
        # Get the article URL and title.
        item = CsdnblogItem()
        item['article_name'] = response.xpath(
            '//h1[@class="title-article"]/text()').extract_first()
        item['article_url'] = str(response.url)

        yield item

        # Get the URL of the next article: the href of the second-to-last
        # <li> in the "tool-box" list.
        next_url = response.xpath(
            '//div[@class="tool-box"]/ul/li[last()-1]/a/@href').extract_first()
        # A missing (None) or empty href means there is nothing to follow.
        if next_url:
            # Resolve against the current page so a relative href becomes
            # an absolute URL; an already-absolute href passes through
            # unchanged.
            yield Request(response.urljoin(next_url), callback=self.parse)
Пример #4
0
    def parse(self, response):
        """Scrape one post and queue every link found under "nav-previous".

        Yields a ``CsdnblogItem`` whose ``article_name`` is the list of
        UTF-8 encoded ``<h1>`` text nodes and whose ``article_url`` is the
        UTF-8 encoded page URL. Afterwards prints and yields one
        ``Request`` per ``nav-previous`` href, routed back to this method.

        Args:
            response: The downloaded page; ``response.url`` is read and the
                response is handed to ``Selector`` for XPath queries.
        """
        doc = Selector(response)

        # Get the article URL and title.
        entry = CsdnblogItem()
        page_url = str(response.url)
        headings = doc.xpath('//h1/text()').extract()

        entry['article_name'] = [text.encode('utf-8') for text in headings]
        entry['article_url'] = page_url.encode('utf-8')
        yield entry

        # Get the URL of the next article; hrefs are used exactly as found.
        for link in doc.xpath('//div[@class="nav-previous"]/a/@href').extract():
            # Parenthesized single argument prints the same text under
            # Python 2's print statement.
            print(link)
            yield Request(link, callback=self.parse)