Example #1
 def parse_page(self, response):
     """Scrapes information from pages into items"""
     item = CrawlerItem()
     item['url'] = response.url.encode('utf-8')
     item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
     item['published'] = parser.parse(
         get_first(
             response.selector.xpath(
                 '//meta[@property="vr:published_time"]/@content').extract(
                 ))).isoformat().encode('utf-8')
     item['title'] = get_first(
         response.selector.xpath(
             '//meta[@property="og:title"]/@content').extract())
     item['description'] = get_first(
         response.selector.xpath(
             '//meta[@property="og:description"]/@content').extract()
     ).strip()
     item['text'] = "".join([
         s.strip().encode('utf-8') for s in response.selector.xpath(
             '//div[@class="main-text "]/p/text()').extract()
     ])
     item['author'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//meta[@name="author"]/@content').extract()
     ]
     item['keywords'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//meta[@name="keywords"]/@content').extract()
     ]
     item['resource'] = self.name
     item['publication_id'] = hashlib.sha1(
         (str(item['url']) + str(item['published']))).hexdigest()
     return item
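All of the examples on this page lean on the same scaffolding that the snippets themselves never show: a `CrawlerItem` declaring the fields being filled in, a `get_first` helper, and a handful of imports (`datetime`, `hashlib`, `dateutil.parser`, Scrapy's `Request`). Here is a minimal sketch reconstructed from usage; the exact field list and the `get_first` semantics (first extracted value, or `None` when the selector matched nothing) are assumptions, not taken from the source.

# Minimal scaffolding inferred from the snippets (assumed, not from the source).
import datetime
import hashlib

from dateutil import parser          # used as parser.parse(...) in the examples
from scrapy import Item, Field, Request


def get_first(values):
    """Return the first extracted value, or None if nothing matched."""
    return values[0] if values else None


class CrawlerItem(Item):
    # Fields inferred from the item['...'] assignments in the examples.
    url = Field()
    visited = Field()
    published = Field()
    title = Field()
    description = Field()
    text = Field()
    author = Field()
    keywords = Field()
    resource = Field()
    publication_id = Field()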
Example #2
 def parse_page(self, response):
     """Scrapes information from pages into items"""
     item = CrawlerItem()
     item['url'] = response.url.encode('utf-8')
     item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
     item['published'] = get_first(
         response.selector.xpath('//meta[@name="date"]/@content').extract())
     item['title'] = get_first(
         response.selector.css('.headline').xpath('./text()').extract())
     item['description'] = get_first(
         response.selector.xpath(
             '//meta[@name="description"]/@content').extract())
     item['text'] = "".join([
         s.strip().encode('utf-8') for s in response.selector.xpath(
             '//div[@class="article-section clearfix"]/p/text()').extract()
     ])
     item['author'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//p[@class="author"]/a/text()').extract()
     ]
     item['keywords'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//meta[@name="news_keywords"]/@content').extract()
     ]
     return item
Example #3
 def parse_page(self, response):
     """Scrapes information from pages into items"""
     item = CrawlerItem()
     item['url'] = response.url.encode('utf-8')
     item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
     item['published'] = get_first(
         response.selector.xpath('//meta[@name="date"]/@content').extract())
     item['title'] = get_first(
         response.selector.xpath(
             '//meta[@property="og:title"]/@content').extract())
     item['description'] = get_first(
         response.selector.xpath(
             '//meta[@name="description"]/@content').extract())
     item['text'] = "".join([
         s.strip().encode('utf-8')
         for s in response.selector.css('.article__item').css(
             '.paragraph').xpath('.//text()').extract()
     ])
     item['author'] = [
         s.encode('utf-8') for s in response.selector.css('.byline').css(
             'span[itemprop="name"]').xpath('./text()').extract()
     ]
     item['keywords'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//meta[@name="keywords"]/@content').extract()
     ]
     # Handle next pages
     next_page = get_first(
         response.selector.xpath('//link[@rel="next"]/@href').extract())
     if next_page:
         self.logger.debug("Next page found: " + next_page)
         yield Request(next_page, callback=self.parse_page)
     yield item
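Because this version yields both a follow-up Request and the item, parse_page has to be registered as a spider callback; Scrapy consumes the generator, schedules the pagination request, and routes the item to the pipelines. A sketch of the wiring, where the spider name, domain, and link-extractor pattern are placeholders rather than anything from the source:

# Hypothetical spider wiring for the paginated parse_page above.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class NewsSpider(CrawlSpider):
    name = 'news'                               # becomes item['resource'] in other examples
    allowed_domains = ['example.com']           # placeholder
    start_urls = ['https://example.com/news/']  # placeholder

    rules = (
        # Follow article links and hand each page to parse_page.
        Rule(LinkExtractor(allow=r'/article/'), callback='parse_page', follow=True),
    )

    # parse_page from the example above goes here.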
Example #4
 def parse_page(self, response):
     """Scrapes information from pages into items"""
     published = parser.parse(
         get_first(
             response.selector.xpath(
                 '//meta[@name="date"]/@content').extract()))
     published = published.replace(tzinfo=timezone('UTC'))
     # A filter that dropped articles published before an
     # EARLIEST_PUBLISHED project setting used to live here.
     item = CrawlerItem()
     item['url'] = response.url.encode('utf-8')
     item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
     item['published'] = published.isoformat().encode('utf-8')
     item['title'] = get_first(
         response.selector.xpath(
             '//meta[@property="og:title"]/@content').extract())
     item['description'] = get_first(
         response.selector.xpath(
             '//meta[@name="description"]/@content').extract())
     # item['text'] extraction is disabled in this spider.
     item['author'] = [
         s.encode('utf-8') for s in response.selector.css('.byline').css(
             'span[itemprop="name"]').xpath('./text()').extract()
     ]
     item['keywords'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//meta[@name="keywords"]/@content').extract()
     ]
     item['resource'] = self.name
     item['publication_id'] = hashlib.sha1(
         str(item['url']) + str(item['published'])).hexdigest()
     # Handle next pages
     next_page = get_first(
         response.selector.xpath('//link[@rel="next"]/@href').extract())
     if next_page:
         self.logger.debug("Next page found: " + next_page)
         yield Request(next_page, callback=self.parse_page)
     yield item
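The disabled date cutoff noted in the comment near the top of Example #4 is the kind of check Scrapy normally hosts in an item pipeline, where raising DropItem is honored per item. A possible pipeline implementing it; only the EARLIEST_PUBLISHED setting name comes from the original code, the rest is an assumption:

# One way to implement the disabled EARLIEST_PUBLISHED filter as a pipeline (a sketch).
from dateutil import parser
from pytz import timezone
from scrapy.exceptions import DropItem


class EarliestPublishedPipeline(object):
    def __init__(self, earliest):
        self.earliest = earliest

    @classmethod
    def from_crawler(cls, crawler):
        # Read the cutoff once from settings,
        # e.g. EARLIEST_PUBLISHED = '2016-01-01T00:00:00+00:00'
        return cls(parser.parse(crawler.settings.get('EARLIEST_PUBLISHED')))

    def process_item(self, item, spider):
        published = parser.parse(item['published'])
        if published.tzinfo is None:
            published = published.replace(tzinfo=timezone('UTC'))
        if published < self.earliest:
            raise DropItem('Article from %s published %s, before cutoff %s'
                           % (spider.name, published.isoformat(),
                              self.earliest.isoformat()))
        return item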
Example #5
 def parse_page(self, response):
     """Scrapes information from pages into items"""
     item = CrawlerItem()
     item['url'] = response.url.encode('utf-8')
     item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
     item['published'] = parser.parse(
         get_first(response.selector.xpath(
             '//time/@datetime').extract())).isoformat().encode('utf-8')
     item['title'] = get_first(
         response.selector.xpath(
             '//meta[@property="og:title"]/@content').extract())
     item['description'] = get_first(
         response.selector.xpath(
             '//meta[@name="description"]/@content').extract())
     item['text'] = "".join([
         s.strip().encode('utf-8') for s in response.selector.css(
             '.article>.body>p').xpath('.//text()').extract()
     ])
     item['author'] = [
         s.encode('utf-8')
         for s in response.selector.css('.authorContainer').xpath(
             './/span/strong/span/text()').extract()
     ]
     item['keywords'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//meta[@name="news_keywords"]/@content').extract()
     ]
     item['resource'] = self.name
     item['publication_id'] = hashlib.sha1(
         (str(item['url']) + str(item['published']))).hexdigest()
     return item
Example #6
 def parse_page(self, response):
     """Scrapes information from pages into items"""
     item = CrawlerItem()
     item['url'] = response.url.encode('utf-8')
     item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
     item['published'] = get_first(
         response.selector.xpath('//time/@datetime').extract())
     item['title'] = get_first(
         response.selector.xpath(
             '//meta[@property="og:title"]/@content').extract())
     item['description'] = get_first(
         response.selector.xpath(
             '//meta[@name="description"]/@content').extract())
     item['text'] = "".join([
         s.strip().encode('utf-8') for s in response.selector.css(
             '.article-content>.rtf-content-wrapper>P').xpath(
                 './/text()').extract()
     ])
     item['author'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//div[@class="name"]/a[@rel="author"]/text()').extract()
     ]
     item['keywords'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//meta[@name="news_keywords"]/@content').extract()
     ]
     return item
Example #7
 def parse_page(self, response):
     """Scrapes information from pages into items"""
     item = CrawlerItem()
     item['url'] = response.url.encode('utf-8')
     item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
     item['published'] = get_first(
         response.selector.xpath(
             '//span[@class="Datum"]/@content').extract())
     item['title'] = get_first(
         response.selector.xpath(
             '//meta[@property="og:title"]/@content').extract())
     item['description'] = get_first(
         response.selector.xpath(
             '//meta[@property="og:description"]/@content').extract()
     ).strip()
     item['text'] = "".join([
         s.strip().encode('utf-8') for s in response.selector.xpath(
             '//div[@class="FAZArtikelText"]/div/p/text()').extract()
     ])
     item['author'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//span[@class="Autor"]/span[@class="caps last"]/a/span/text()'
         ).extract()
     ]
     item['keywords'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//meta[@name="keywords"]/@content').extract()
     ]
     return item
Example #8
 def parse_page(self, response):
     """Scrapes information from pages into items"""
     item = CrawlerItem()
     item['url'] = response.url.encode('utf-8')
     item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
     item['published'] = get_first(
         response.selector.xpath('//meta[@name="date"]/@content').extract())
     item['title'] = get_first(
         response.selector.xpath(
             '//meta[@property="og:title"]/@content').extract())
     item['description'] = get_first(
         response.selector.xpath(
             '//meta[@name="description"]/@content').extract())
     item['text'] = "".join([
         s.strip().encode('utf-8') for s in response.selector.css(
             '.artContent').xpath('.//text()').extract()
     ])
     item['author'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//meta[@name="author"]/@content').extract()
     ]
     item['keywords'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//meta[@name="keywords"]/@content').extract()
     ]
     return item
Example #9
 def parse_page(self, response):
     """Scrapes information from pages into items"""
     published = parser.parse(
         get_first(response.selector.xpath('//time/@datetime').extract()))
     # A filter that dropped articles published before an
     # EARLIEST_PUBLISHED project setting used to live here.
     item = CrawlerItem()
     item['url'] = response.url.encode('utf-8')
     item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
     item['published'] = published.isoformat().encode('utf-8')
     item['title'] = get_first(
         response.selector.xpath(
             '//meta[@property="og:title"]/@content').extract())
     item['description'] = get_first(
         response.selector.xpath(
             '//meta[@property="og:description"]/@content').extract()
     ).strip()
     # item['text'] extraction is disabled in this spider.
     item['author'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//span[@class="Autor"]/span[@class="caps last"]/a/span/text()'
         ).extract()
     ]
     item['keywords'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//meta[@name="keywords"]/@content').extract()
     ]
     item['resource'] = self.name
     item['publication_id'] = hashlib.sha1(
         (str(item['url']) + str(item['published']))).hexdigest()
     return item
Example #10
 def parse_page(self, response):
     """Scrapes all non-empty text from a page into an item"""
     text = []
     for s in response.xpath('//text()').extract():
         if s.strip() != "":
             text.append(s.strip())
     if text:
         self.logger.debug("Scraped text from %s", response.url)
         item = CrawlerItem()
         item['url'] = response.url
         item['title'] = get_first(
             response.xpath('//title//text()').extract())
         item['text'] = text
         return item
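All of these snippets are Python 2 (byte-string encode() calls, and originally a print statement in Example #10) against an older Scrapy API. Under Python 3 and current Scrapy, the encode('utf-8') calls should simply be dropped (items hold text; feed exporters handle encoding), extract()/extract_first() become getall()/get(), and the sha1 input must be encoded explicitly since hashlib requires bytes. A sketch of the recurring pattern in the modern style, under those assumptions; it is a spider method, not a verbatim port of any one example:

# Python 3 / current-Scrapy take on the recurring pattern above (a sketch).
import datetime
import hashlib


def parse_page(self, response):
    item = CrawlerItem()
    item['url'] = response.url
    item['visited'] = datetime.datetime.now().isoformat()
    item['published'] = response.xpath(
        '//meta[@name="date"]/@content').get(default='')
    item['title'] = response.xpath(
        '//meta[@property="og:title"]/@content').get()
    item['keywords'] = response.xpath(
        '//meta[@name="keywords"]/@content').getall()
    # sha1() needs bytes in Python 3, so encode the key explicitly.
    item['publication_id'] = hashlib.sha1(
        (item['url'] + item['published']).encode('utf-8')).hexdigest()
    return item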