def parse_item(self, response):
    """Extract news entries from a donews listing page.

    Walks every <li> inside the <ul> elements under div.bd and yields
    one NewscrawlerItem per entry.
    """
    for list_sel in response.xpath('//div[@class="bd"]/ul'):
        for entry in list_sel.xpath(".//li"):
            item = NewscrawlerItem()
            item["title"] = entry.xpath(".//a[2]/text()").extract()
            # Keywords are derived from the first extracted title string.
            item["description"] = get_key_words(item["title"][0])
            item["url"] = entry.xpath(".//a[2]/@href").extract()
            raw_date = entry.xpath(".//span/text()").extract()
            # FixDate splits the raw date text into year/month/day parts.
            item["year"], item["month"], item["day"] = FixDate(raw_date[0])
            item["site"] = ["donews"]
            yield item
def parse(self, response):
    """Parse one pedaily listing page and queue the next page.

    Yields a NewscrawlerItem for each <li> under ul#newslist-all, then
    follows pagination by incrementing the current page index read from
    the "load more" button.
    """
    for entry in response.xpath('//ul[@id="newslist-all"]/li'):
        item = NewscrawlerItem()
        item["title"] = entry.xpath(".//h3/a/text()").extract()
        # Keywords are derived from the first extracted title string.
        item["description"] = get_key_words(item["title"][0])
        item["url"] = entry.xpath(".//h3/a/@href").extract()
        raw_date = entry.xpath('.//div[@class="tag"]/span/text()').extract()
        item["year"], item["month"], item["day"] = FixDate(raw_date[0])
        item["site"] = ["pedaily"]
        yield item
    # The "load more" element carries the current page index; request page+1.
    current_page = response.xpath('//*[@id="loadmore"]/@data-pageindex').extract()[0]
    next_page = int(current_page) + 1
    next_url = "http://www.pedaily.cn/all/" + str(next_page)
    yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    """Parse one 36kr article-list page and queue the next batch.

    Yields a NewscrawlerItem per <article> in the list, then follows
    the "load more" link to continue crawling.
    """
    for sel in response.xpath('//div[@class="articles J_articleList"]/article'):
        item = NewscrawlerItem()
        item["title"] = sel.xpath(".//div/a/text()").extract()
        # Keywords are derived from the first extracted title string.
        item["description"] = get_key_words(item["title"][0])
        # Article links are site-relative; prefix the host.
        item["url"] = [u"http://36kr.com" + sel.xpath(".//div/a/@href").extract()[0]]
        # The date lives in a <time> element when present, otherwise in an
        # <abbr>; both carry it in their @title attribute.
        # (Idiom fix: truthiness test instead of comparing a list to [].)
        if sel.xpath(".//div/div[1]/span/time/text()").extract():
            date = sel.xpath(".//div/div[1]/span/time/@title").extract()
        else:
            date = sel.xpath(".//div/div[1]/span/abbr/@title").extract()
        item["year"], item["month"], item["day"] = FixDate(date[0])
        item["site"] = ["36kr"]
        yield item
    # Follow the "load more" link for the next page of results.
    return_url = (
        "http://36kr.com"
        + response.xpath(
            '//div[@class="articles J_articleList"]/a[@class="load-more J_listLoadMore"]/@href'
        ).extract()[0]
    )
    yield scrapy.Request(return_url, callback=self.parse)