Example #1
    def parse_item(self, response):
        # Each <ul> under div.bd is a news block; each <li> inside it is one article.
        for xsel in response.xpath('//div[@class="bd"]/ul'):
            for sel in xsel.xpath(".//li"):
                item = NewscrawlerItem()
                item["title"] = sel.xpath(".//a[2]/text()").extract()
                item["description"] = get_key_words(item["title"][0])
                item["url"] = sel.xpath(".//a[2]/@href").extract()
                date = sel.xpath(".//span/text()").extract()
                item["year"], item["month"], item["day"] = FixDate(date[0])
                item["site"] = ["donews"]
                yield item
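
All three spiders rely on a NewscrawlerItem class and on get_key_words / FixDate helpers that the examples do not show. A minimal sketch of what those pieces might look like (the field names follow the assignments above; the helper bodies are illustrative placeholders, not the original implementations):

    import re

    import scrapy


    class NewscrawlerItem(scrapy.Item):
        title = scrapy.Field()
        description = scrapy.Field()
        url = scrapy.Field()
        year = scrapy.Field()
        month = scrapy.Field()
        day = scrapy.Field()
        site = scrapy.Field()


    def get_key_words(title):
        # Placeholder: presumably extracts keywords from the article title.
        return title.split()


    def FixDate(date_str):
        # Placeholder: presumably normalizes a date string such as
        # "2016-05-12 10:30" into a (year, month, day) tuple of strings.
        match = re.search(r"(\d{4})\D+(\d{1,2})\D+(\d{1,2})", date_str)
        if match:
            return match.groups()
        return "", "", ""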
Example #2
    def parse(self, response):
        # Each <li> under ul#newslist-all is one article entry.
        for sel in response.xpath('//ul[@id="newslist-all"]/li'):
            item = NewscrawlerItem()
            item["title"] = sel.xpath(".//h3/a/text()").extract()
            item["description"] = get_key_words(item["title"][0])
            item["url"] = sel.xpath(".//h3/a/@href").extract()
            date = sel.xpath('.//div[@class="tag"]/span/text()').extract()
            item["year"], item["month"], item["day"] = FixDate(date[0])
            item["site"] = ["pedaily"]
            yield item

        # data-pageindex on the "load more" button holds the current page index,
        # so the next page to request is that value plus one.
        page_num = response.xpath('//*[@id="loadmore"]/@data-pageindex').extract()[0]
        page_realnum = int(page_num) + 1
        url = "http://www.pedaily.cn/all/" + str(page_realnum)
        yield scrapy.Request(url, callback=self.parse)
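
Example #2 follows the next page unconditionally; the crawl only stops once the @data-pageindex attribute is no longer found (the extract()[0] call then raises an IndexError). A sketch of the same pattern with an explicit stop condition, assuming a hypothetical max_page attribute on the spider:

        # max_page is an assumed attribute, not part of the original spider.
        page_num = int(response.xpath('//*[@id="loadmore"]/@data-pageindex').extract()[0])
        if page_num + 1 <= getattr(self, "max_page", 50):
            yield scrapy.Request(
                "http://www.pedaily.cn/all/" + str(page_num + 1),
                callback=self.parse,
            )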
Example #3
    def parse(self, response):
        for sel in response.xpath('//div[@class="articles J_articleList"]/article'):
            item = NewscrawlerItem()
            item["title"] = sel.xpath(".//div/a/text()").extract()
            item["description"] = get_key_words(item["title"][0])
            item["url"] = [u"http://36kr.com" + sel.xpath(".//div/a/@href").extract()[0]]
            if sel.xpath(".//div/div[1]/span/time/text()").extract() != []:
                date = sel.xpath(".//div/div[1]/span/time/@title").extract()
            else:
                date = sel.xpath(".//div/div[1]/span/abbr/@title").extract()
            item["year"], item["month"], item["day"] = FixDate(date[0])
            item["site"] = ["36kr"]
            yield item

        # Follow the "load more" link to the next page of the article list.
        return_url = (
            "http://36kr.com"
            + response.xpath(
                '//div[@class="articles J_articleList"]/a[@class="load-more J_listLoadMore"]/@href'
            ).extract()[0]
        )
        yield scrapy.Request(return_url, callback=self.parse)
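
None of the examples include the surrounding spider class. A minimal sketch of how the Example #3 parse() method might be hosted (the spider name, allowed_domains, and start_urls below are assumptions, not taken from the original project):

    import scrapy


    class Kr36Spider(scrapy.Spider):
        name = "36kr_news"
        allowed_domains = ["36kr.com"]
        start_urls = ["http://36kr.com/"]

        def parse(self, response):
            # Body as in Example #3 above.
            ...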