示例#1
0
 def page(self, response):
     """Yield one NewsItem per article link inside the article-content div.

     Fixes: create a fresh NewsItem per link instead of mutating one
     shared instance across yields, and skip anchors missing text or
     href instead of raising IndexError via extract()[0].
     """
     for link in response.xpath('//div[@class="article-content"]//a'):
         title = link.xpath('./text()').extract_first()
         href = link.xpath('./@href').extract_first()
         if title is None or href is None:
             # Anchor without visible text or a target URL — nothing to report.
             continue
         item = NewsItem()
         item['title'] = title
         item['link'] = href
         item['site'] = 'jiemian'
         yield item
示例#2
0
 def page(self, response):
     """Yield a NewsItem for each h3 headline in the article-main block.

     Fix: build a new NewsItem on every iteration; previously a single
     shared item was mutated and yielded repeatedly, so all yielded
     references pointed at the last headline's data.
     """
     headlines = response.xpath(
         '//div[@class="article-main"]//h3/text()').extract()
     for headline in headlines:
         item = NewsItem()
         item['title'] = headline
         # No per-headline URL on this page; fall back to the page URL.
         item['link'] = response.url
         item['site'] = 'jiemian'
         yield item
示例#3
0
 def parse(self, response):
     """Yield today's zaobao posts; stop at the first post not dated today.

     Fixes: fresh NewsItem per post instead of one mutated shared
     instance, today's date string hoisted out of the loop, and the
     datestamp read via extract_first so a missing node breaks the loop
     instead of raising IndexError.
     """
     today = time.strftime("%d/%m/%Y", time.localtime())
     for post in response.xpath('//div[@class="post-list view-content"]/div/div'):
         stamp = post.xpath('.//span[@class="datestamp"]/text()').extract_first()
         if stamp != today:
             # Listing appears newest-first: the first stale (or missing)
             # datestamp ends today's batch.
             break
         item = NewsItem()
         item['title'] = post.xpath('.//span[@class="post-title"]/text()').extract()[0]
         item['link'] = 'https://www.zaobao.com' + post.xpath('.//a/@href').extract()[0]
         item['site'] = 'zaobao'
         yield item
示例#4
0
文件: rif.py 项目: TonyBay/daily_news
 def parse(self, response):
     """Yield today's RFI articles; stop at the first one not dated today.

     Fixes: fresh NewsItem per article instead of one shared mutated
     instance, and today's date string computed once instead of on
     every iteration.
     """
     # NOTE(review): "%y" produces a 2-digit year, while <time datetime>
     # is conventionally ISO "YYYY-MM-DD" ("%Y-%m-%d") — if the two never
     # match, the loop breaks immediately. Confirm against the live markup.
     today = time.strftime("%y-%m-%d", time.localtime())
     for article in response.xpath('//ul[@id="articleList"]/li'):
         if article.xpath('.//time/@datetime').extract()[0] != today:
             break
         item = NewsItem()
         item['title'] = article.xpath('.//h3/a/@title').extract()[0]
         item['link'] = 'http://cn.rfi.fr' + article.xpath(
             './/h3/a/@href').extract()[0]
         item['site'] = 'rfi'
         yield item
示例#5
0
文件: wsc.py 项目: TonyBay/daily_news
 def page(self, response):
     """Yield one NewsItem per anchor found in aligned rich-text paragraphs.

     Fixes: fresh NewsItem per anchor instead of a shared mutated
     instance, str.join instead of quadratic string +=, and a narrow
     IndexError catch (anchor without href) instead of a bare except.
     """
     for anchor in response.xpath('//div[@class="rich-text"]/p[@align]//a'):
         item = NewsItem()
         # The title may be split across nested text nodes; join them all.
         item['title'] = ''.join(anchor.xpath('.//text()').extract())
         try:
             item['link'] = anchor.xpath('./@href').extract()[0]
         except IndexError:
             # Anchor carries no href: fall back to the page URL.
             item['link'] = response.url
         item['site'] = 'wsc'
         yield item
示例#6
0
 def parse(self, response):
     """Extract the readable article from the response and yield a NewsItem.

     Uses readability's Document to get the title and main-content HTML,
     strips the tags to plain text, saves the text to disk, and yields an
     item carrying the title, source URL, and saved-file path.

     Fix: removed the leftover debug print of the full summary HTML
     (the sibling copy of this method already has it commented out).
     """
     doc = Document(response.text)
     title = doc.title()
     # summary() returns the main-content HTML; removeTags() reduces it to text.
     text = removeTags(doc.summary())
     contenPath = saveToText(title, text)
     # saveToMongodb(title, contenPath)  # MongoDB persistence currently disabled
     item = NewsItem()
     item['title'] = title
     item['url'] = response.url
     item['contenPath'] = contenPath
     yield item
示例#7
0
 def parse(self, response):
     """Yield a NewsItem per listing entry, then follow the "load more" link.

     Fixes: fresh NewsItem per entry instead of one shared mutated
     instance, and a narrow IndexError catch around only the next-page
     extraction (a missing "more" link is the expected end condition)
     instead of a bare except wrapping the whole Request.
     """
     for entry in response.xpath('//div[@class="b-plainlist__info"]'):
         item = NewsItem()
         item['title'] = entry.xpath('./h2/a/text()').extract()[0]
         item['link'] = 'http://sputniknews.cn' + entry.xpath(
             './h2/a/@href').extract()[0]
         item['site'] = 'sputnik'
         yield item
     try:
         # The "load more" button carries the next page in @data-href;
         # IndexError here means we reached the last page.
         next_href = response.xpath(
             '//div[@class="b-more"]/a/@data-href').extract()[0]
     except IndexError:
         return
     yield scrapy.Request(url='http://sputniknews.cn' + next_href,
                          callback=self.parse)
示例#8
0
 def page(self, response):
     """Yield recent thepaper.cn items and paginate while pages are full.

     Fixes: fresh NewsItem per entry instead of one shared mutated
     instance; nested ifs merged into a single guard; the duplicated
     recency regex hoisted into one local.
     """
     recent = r'[0-9]{1,2}小时前|[0-9]{1,2}分钟前'
     for entry in response.xpath('//div[@class="news_li"]'):
         # Keep only entries that show a like-count span and an
         # "N hours/minutes ago" timestamp.
         if entry.re(r'<span class="trbszan">') and entry.re(recent):
             item = NewsItem()
             item['title'] = entry.xpath('.//h2/a/text()').extract()[0]
             item['link'] = 'https://www.thepaper.cn/' + entry.xpath(
                 './/a/@href').extract()[0]
             item['site'] = 'thepaper'
             yield item
     # A full page (20 recent entries) suggests more follow: bump the
     # trailing digit of the URL to request the next page.
     # NOTE(review): this only handles single-digit page numbers — confirm.
     if len(response.selector.re(recent)) == 20:
         yield scrapy.Request(
             url=response.url[:-1] + str(int(response.url[-1]) + 1),
             callback=self.page,
         )
示例#9
0
 def parse(self, response):
     """Pull the readable article out of the page and yield it as a NewsItem.

     readability's Document supplies the title and main-content HTML;
     the HTML is reduced to plain text and written to disk, and the
     yielded item carries the title, source URL, and saved-file path.
     """
     document = Document(response.text)
     article_title = document.title()
     # summary() is the main-content HTML; removeTags() flattens it to text.
     plain_text = removeTags(document.summary())
     saved_path = saveToText(article_title, plain_text)
     item = NewsItem()
     item['title'] = article_title
     item['url'] = response.url
     item['contenPath'] = saved_path
     yield item