Exemplo n.º 1
0
 def parse_article(self, response):
     """Extract one article from its page and yield it as an ``Article``.

     The body text is collected from every text node under
     ``div.article-body`` and the byline from every text node under
     ``p.byline``. Title, URL and publication date are read from
     ``response.meta``.

     Args:
         response: Response for the article page. It must provide
             ``css()`` and ``meta`` (presumably a Scrapy response whose
             meta was filled in by the requesting callback -- confirm
             against the caller).

     Yields:
         A single ``Article``. ``title`` and ``url`` default to ``''``
         and ``published`` to ``datetime.now()`` when the corresponding
         meta key is absent.
     """
     # Strip first, then filter: a whitespace-only text node is truthy but
     # strips to '', which would leave doubled or dangling spaces once the
     # fragments are joined.
     body_nodes = response.css('div.article-body *::text').extract()
     text = ' '.join(
         piece for piece in (node.strip() for node in body_nodes) if piece
     )

     byline_nodes = response.css('p.byline *::text').extract()
     author = ' '.join(
         piece for piece in (node.strip() for node in byline_nodes) if piece
     )

     yield Article(
         title=response.meta.get('title', ''),
         url=response.meta.get('url', ''),
         published=response.meta.get('date', datetime.now()),
         content=text,
         author=author,
         source='San Francisco Chronicle',
     )
 def parse_article(self, response):
     """Extract one article from its page and yield it as an ``Article``.

     The body text is collected from every text node under
     ``div.pb-f-article-body div[data-type=text]`` and the byline from
     every text node under ``span.byline``. Title, URL and publication
     date are read from ``response.meta``.

     Args:
         response: Response for the article page. It must provide
             ``css()`` and ``meta`` (presumably a Scrapy response whose
             meta was filled in by the requesting callback -- confirm
             against the caller).

     Yields:
         A single ``Article``. ``title`` and ``url`` default to ``''``
         and ``published`` to ``datetime.now()`` when the corresponding
         meta key is absent.
     """
     # Strip first, then filter: a whitespace-only text node is truthy but
     # strips to '', which would leave doubled or dangling spaces once the
     # fragments are joined.
     body_nodes = response.css(
         'div.pb-f-article-body div[data-type=text] *::text').extract()
     text = ' '.join(
         piece for piece in (node.strip() for node in body_nodes) if piece
     )

     byline_nodes = response.css('span.byline *::text').extract()
     author = ' '.join(
         piece for piece in (node.strip() for node in byline_nodes) if piece
     )

     yield Article(title=response.meta.get('title', ''),
                   url=response.meta.get('url', ''),
                   published=response.meta.get('date', datetime.now()),
                   content=text,
                   author=author,
                   source='LA Times')
Exemplo n.º 3
0
 def parse_article(self, response):
     """Extract one article from its page and yield it as an ``Article``.

     The body text is collected from every text node under
     ``div[itemprop=articleBody]`` and the byline from every text node
     under ``div.trb_ar_by``. Title, URL and publication date are read
     from ``response.meta``.

     Args:
         response: Response for the article page. It must provide
             ``css()`` and ``meta`` (presumably a Scrapy response whose
             meta was filled in by the requesting callback -- confirm
             against the caller).

     Yields:
         A single ``Article``. ``title`` and ``url`` default to ``''``
         and ``published`` to ``datetime.now()`` when the corresponding
         meta key is absent.
     """
     # Strip first, then filter: a whitespace-only text node is truthy but
     # strips to '', which would leave doubled or dangling spaces once the
     # fragments are joined.
     body_nodes = response.css(
         'div[itemprop=articleBody] *::text').extract()
     text = ' '.join(
         piece for piece in (node.strip() for node in body_nodes) if piece
     )

     byline_nodes = response.css('div.trb_ar_by *::text').extract()
     author = ' '.join(
         piece for piece in (node.strip() for node in byline_nodes) if piece
     )

     yield Article(title=response.meta.get('title', ''),
                   url=response.meta.get('url', ''),
                   published=response.meta.get('date', datetime.now()),
                   content=text,
                   author=author,
                   source='San Diego Union Tribune')
Exemplo n.º 4
0
 def parse_article(self, response):
     """Extract one article from its page and yield it as an ``Article``.

     The body text is collected from every text node under
     ``div#content-body`` and the byline from every text node under
     ``div.byline``. Title, URL and publication date are read from
     ``response.meta``.

     Args:
         response: Response for the article page. It must provide
             ``css()`` and ``meta`` (presumably a Scrapy response whose
             meta was filled in by the requesting callback -- confirm
             against the caller).

     Yields:
         A single ``Article``. ``title`` and ``url`` default to ``''``
         and ``published`` to ``datetime.now()`` when the corresponding
         meta key is absent.
     """
     # Strip first, then filter: a whitespace-only text node is truthy but
     # strips to '', which would leave doubled or dangling spaces once the
     # fragments are joined.
     body_nodes = response.css('div#content-body *::text').extract()
     text = ' '.join(
         piece for piece in (node.strip() for node in body_nodes) if piece
     )

     byline_nodes = response.css('div.byline *::text').extract()
     author = ' '.join(
         piece for piece in (node.strip() for node in byline_nodes) if piece
     )

     yield Article(title=response.meta.get('title', ''),
                   url=response.meta.get('url', ''),
                   published=response.meta.get('date', datetime.now()),
                   content=text,
                   author=author,
                   source='Sacramento Bee')