def page(self, response):
    """Parse a jiemian article page: yield one NewsItem per link in the article body.

    Each anchor under the article-content div becomes an item with its text as
    title and its href as link.
    """
    for new in response.xpath('//div[@class="article-content"]//a'):
        # Create a fresh item per yield: reusing a single NewsItem instance
        # across iterations mutates items already handed to the pipeline.
        item = NewsItem()
        item['title'] = new.xpath('./text()').extract()[0]
        item['link'] = new.xpath('./@href').extract()[0]
        item['site'] = 'jiemian'
        yield item
def page(self, response):
    """Parse a jiemian page: yield a NewsItem for each h3 headline text.

    Individual article links are not extracted here, so every item's link is
    the page URL itself.
    """
    for headline in response.xpath(
            '//div[@class="article-main"]//h3/text()').extract():
        # Fresh item per headline — a shared instance would be clobbered by
        # later iterations while earlier yields are still in the pipeline.
        item = NewsItem()
        item['title'] = headline
        item['link'] = response.url  # no per-article href available here
        item['site'] = 'jiemian'
        yield item
def parse(self, response):
    """Parse the zaobao post list: yield today's posts, stop at the first stale one.

    The listing is assumed newest-first, so the first entry whose datestamp is
    not today's date ends the scan.
    """
    # Hoisted loop invariant: today's date formatted like the site's datestamp.
    today = time.strftime("%d/%m/%Y", time.localtime())
    for new in response.xpath('//div[@class="post-list view-content"]/div/div'):
        if new.xpath('.//span[@class="datestamp"]/text()').extract()[0] != today:
            break
        # Fresh item per yield: reusing one NewsItem instance corrupts items
        # already queued in the pipeline.
        item = NewsItem()
        item['title'] = new.xpath('.//span[@class="post-title"]/text()').extract()[0]
        item['link'] = 'https://www.zaobao.com' + new.xpath('.//a/@href').extract()[0]
        item['site'] = 'zaobao'
        yield item
def parse(self, response):
    """Parse the RFI article list: yield today's articles, stop at the first stale one.

    Entries are assumed newest-first; the first <time> whose datetime attribute
    differs from today's date (two-digit year, "%y-%m-%d") ends the scan.
    """
    # Hoisted loop invariant: today's date in the site's datetime format.
    # NOTE(review): "%y-%m-%d" uses a two-digit year — presumably matching the
    # site's markup; confirm against a live page.
    today = time.strftime("%y-%m-%d", time.localtime())
    for new in response.xpath('//ul[@id="articleList"]/li'):
        if new.xpath('.//time/@datetime').extract()[0] != today:
            break
        # Fresh item per yield to avoid mutating items already in the pipeline.
        item = NewsItem()
        item['title'] = new.xpath('.//h3/a/@title').extract()[0]
        item['link'] = 'http://cn.rfi.fr' + new.xpath('.//h3/a/@href').extract()[0]
        item['site'] = 'rfi'
        yield item
def page(self, response):
    """Parse a wsc page: yield a NewsItem per anchor inside aligned paragraphs.

    Anchor text may be split across nested nodes, so all text fragments are
    joined to form the title. Anchors without an href fall back to the page URL.
    """
    for new in response.xpath('//div[@class="rich-text"]/p[@align]//a'):
        # Fresh item per yield — a shared instance would be overwritten by
        # later iterations while earlier yields sit in the pipeline.
        item = NewsItem()
        # Join all descendant text nodes instead of quadratic += concatenation.
        item['title'] = ''.join(new.xpath('.//text()').extract())
        try:
            item['link'] = new.xpath('./@href').extract()[0]
        except IndexError:
            # Anchor with no href attribute: use the page URL as the link.
            item['link'] = response.url
        item['site'] = 'wsc'
        yield item
def parse(self, response):
    """Extract the readable article body with readability, save it, and yield a NewsItem.

    Uses readability's Document to pull the title and main-content HTML, strips
    tags, writes the text to disk, and yields an item carrying the saved path.
    """
    html = response.text
    doc = Document(html)
    summary = doc.summary()  # main-content HTML as judged by readability
    title = doc.title()
    # Removed stray debug print(summary) — leftover from development.
    text = removeTags(summary)
    contenPath = saveToText(title, text)  # persist article text; returns file path
    # saveToMongodb(title, contenPath)
    item = NewsItem()
    item['title'] = title
    item['url'] = response.url
    item['contenPath'] = contenPath
    yield item
def parse(self, response):
    """Parse a sputniknews.cn listing page and follow the "more" pagination link.

    Yields one NewsItem per headline, then recursively requests the next page
    via the data-href on the "more" button, stopping when no button is present.
    """
    for new in response.xpath('//div[@class="b-plainlist__info"]'):
        # Fresh item per yield: a reused instance would be mutated after
        # being handed to the pipeline.
        item = NewsItem()
        item['title'] = new.xpath('./h2/a/text()').extract()[0]
        item['link'] = 'http://sputniknews.cn' + new.xpath('./h2/a/@href').extract()[0]
        item['site'] = 'sputnik'
        yield item
    try:
        next_href = response.xpath('//div[@class="b-more"]/a/@data-href').extract()[0]
    except IndexError:
        # No "more" button: last page reached — stop paginating.
        pass
    else:
        yield scrapy.Request(url='http://sputniknews.cn' + next_href,
                             callback=self.parse)
def page(self, response):
    """Parse a thepaper.cn news list page; paginate while the whole page is recent.

    Keeps entries that carry the comment badge and a "N小时前"/"N分钟前"
    timestamp. If all 20 entries on the page match the recency pattern, the
    trailing page number in the URL is incremented and the next page requested.
    """
    for new in response.xpath('//div[@class="news_li"]'):
        # Entry must have a comment badge AND be posted within hours/minutes.
        if (new.re(r'<span class="trbszan">')
                and new.re(r'[0-9]{1,2}小时前|[0-9]{1,2}分钟前')):
            # Fresh item per yield to avoid shared-state corruption in the pipeline.
            item = NewsItem()
            item['title'] = new.xpath('.//h2/a/text()').extract()[0]
            item['link'] = 'https://www.thepaper.cn/' + new.xpath('.//a/@href').extract()[0]
            item['site'] = 'thepaper'
            yield item
    # A full page (20 entries) of recent posts means the next page may also
    # contain recent posts.
    if len(response.selector.re(r'[0-9]{1,2}小时前|[0-9]{1,2}分钟前')) == 20:
        import re
        # Increment the trailing page number in the URL. The previous
        # single-character arithmetic (url[:-1] + str(int(url[-1]) + 1))
        # garbled multi-digit pages ending in 9 (e.g. page 19 -> "...110").
        next_url = re.sub(r'\d+$', lambda m: str(int(m.group(0)) + 1), response.url)
        yield scrapy.Request(url=next_url, callback=self.page)
def parse(self, response):
    """Run readability over the response, save the cleaned article text, and yield a NewsItem."""
    document = Document(response.text)
    article_title = document.title()
    # Strip markup from the readability-extracted main content.
    cleaned_text = removeTags(document.summary())
    # Persist the article text; the helper returns the saved file's path.
    saved_path = saveToText(article_title, cleaned_text)
    item = NewsItem()
    item['title'] = article_title
    item['url'] = response.url
    item['contenPath'] = saved_path
    yield item