示例#1
0
 def getPageDate(self, response):
     """Return the article's publish timestamp as a string.

     Tries the JSON-LD script block first, then falls back to the
     utag_data script used by some article layouts.  Logs the URL and
     returns 'Error' when no date can be extracted.
     """
     data = None
     try:
         # The JSON-LD payload carries an ISO timestamp right after
         # 'datePublished":"'; the [:19] slice keeps YYYY-MM-DDTHH:MM:SS.
         data = (str(
             response.xpath("//script[@type='application/ld+json']").
             extract_first()).split('datePublished":"', 1)[1])[:19]
     except (TypeError, IndexError):
         # This fail case works only on very specific articles.
         scriptData = None
         scriptsList = response.xpath(
             "/html/head/script[not(contains(@type,'text/javascript'))]")
         for script in scriptsList:
             try:
                 scriptData = (script.extract()).split(
                     "<script>utag_data", 1)[1]
                 break
             except IndexError:
                 # BUG FIX: was a bare `except:`; only a missing marker
                 # (IndexError from split) should be skipped here.
                 continue
         if scriptData is not None:
             try:
                 data = (scriptData.split('"publish_date":"',
                                          1)[1]).split("+", 1)[0]
             except IndexError:
                 data = None
         if data is None:
             # BUG FIX: `data` was previously unbound on this path, so the
             # original raised NameError instead of returning 'Error'.
             loggerError.error(response.url)
             data = 'Error'
     except Exception as Error:
         loggerError.error(str(Error) + ' occured at: ' + response.url)
         data = 'Error'
     return data
示例#2
0
 def getPageDate(self, response):
     """Return the datePublished meta content, or 'Error' when absent."""
     data = response.xpath(
         '//meta[@itemprop="datePublished"]/@content').extract_first()
     if data is None:
         # BUG FIX: the original referenced an undefined `Error` name here,
         # raising NameError instead of logging the failing URL.
         loggerError.error(response.url)
         data = 'Error'
     return data
示例#3
0
 def getPageImage(self, response):
     """Return the article's figure image URL, or 'Error' when missing."""
     image_url = response.xpath(
         '//div[@class="content"]/div/figure/img/@src').extract_first()
     if image_url is not None:
         return image_url
     # No image node found; record the offending URL.
     loggerError.error(response.url)
     return 'Error'
示例#4
0
 def getPageTitle(self, response):
     """Return the article headline text, or 'Error' when missing."""
     headline = response.xpath(
         '//h1[@itemprop="headline"]/text()').extract_first()
     if headline is not None:
         return headline
     # No headline node found; record the offending URL.
     loggerError.error(response.url)
     return 'Error'
示例#5
0
    def parse_content(self, response):
        """Return the article body text for the NDTV property serving this
        page, or None (after logging) for an unrecognised host.

        Each NDTV sub-site uses a different markup for the body; the chain
        below picks the matching selector, then the stripped fragments are
        concatenated without separators (original behaviour).
        """
        url = response.url
        if 'gadgets.ndtv.com' in url:
            fragments = response.css('div.content_text>p::text').extract()
        elif 'www.ndtv.com' in url:
            fragments = response.xpath(
                '//div[@itemprop="articleBody"]/text()').extract()
        elif 'auto.ndtv.com' in url or 'sports.ndtv.com' in url:
            fragments = response.xpath(
                '//div[@itemprop="articleBody"]/p/text()').extract()
        elif 'food.ndtv.com' in url or 'profit.ndtv.com' in url:
            fragments = response.xpath(
                '//span[@itemprop="articleBody"]/text()').extract()
        elif 'doctor.ndtv.com' in url:
            fragments = response.xpath(
                '//div[@class="article_storybody"]/p/text()').extract()
        else:
            loggerError.error('Could not handle parsing CONTENT at ' +
                              response.url)
            return None

        # Single join replaces the five duplicated quadratic `+=` loops.
        return ''.join(c.strip() for c in fragments).strip()
示例#6
0
 def getPageImage(self, response):
     """Return the Open Graph image URL, or 'Error' when absent."""
     og_image = response.xpath(
         "//meta[@property='og:image']/@content").extract_first()
     if og_image is not None:
         return og_image
     loggerError.error(response.url)
     return 'Error'
示例#7
0
 def parse_image(self, response):
     """Return the lead-image URL for the NDTV property serving this page,
     or None (after logging) for an unrecognised host."""
     url = response.url
     if 'gadgets.ndtv.com' in url:
         return response.xpath(
             '//div[@class="fullstoryImage"]/picture/source/@srcset'
         ).extract_first()
     elif 'www.ndtv.com' in url:
         return response.xpath(
             '//div[contains(@class, "ins_mainimage_big")]/img/@src'
         ).extract_first()
     elif 'auto.ndtv.com' in url:
         return response.xpath(
             '//img[@itemprop="url"]/@src').extract_first()
     elif 'food.ndtv.com' in url:
         return response.xpath(
             '//div[@itemprop="image"]/meta[@itemprop="url"]/@content'
         ).extract_first()
     elif 'sports.ndtv.com' in url:
         return response.xpath(
             '//div[@itemprop="image"]/img[@class="caption"]/@src'
         ).extract_first()
     elif 'doctor.ndtv.com' in url:
         return response.xpath(
             '//div[@class="article-stry-image"]/img/@src').extract_first()
     elif 'profit.ndtv.com' in url:
         # BUG FIX: the expression began with '///', which is not valid
         # XPath (an abbreviated '//' must be followed by a step) and makes
         # the selector raise instead of matching; '//' is the intent.
         return response.xpath(
             '//div[@id="story_pic"]/div/img/@src').extract_first()
     else:
         loggerError.error('Check for parsing IMAGE at ' + response.url)
         return None
示例#8
0
 def getPageContent(self, response):
     """Return the story body joined by spaces, widening the text() match
     if the direct-child match is empty; 'Error' when both are empty."""
     for xpath in ("//div[@id='storyBody']/p/text()",
                   "//div[@id='storyBody']/p//text()"):
         data = ' '.join(response.xpath(xpath).extract())
         if data:
             return data
     loggerError.error(response.url)
     return 'Error'
示例#9
0
 def getPageDate(self, response):
     """Return the <time datetime> value with any '+HH:MM' timezone offset
     removed; logs and returns 'Error' when the node is missing."""
     try:
         stamp = response.xpath("//time/@datetime").extract_first()
         # rsplit drops the offset; AttributeError when stamp is None is
         # caught below.
         data = stamp.rsplit('+', 1)[0]
     except Exception as Error:
         loggerError.error(str(Error) + ' occured at: ' + response.url)
         data = 'Error'
     finally:
         return data
示例#10
0
    def parse(self, response):
        """Queue article requests from a time.com listing page.

        Handles the large hero box at the top of normal pages, then the
        remaining 'marquee' boxes; URLs already recorded via
        self.postgres.checkUrlExists() are skipped.
        """

        # For the large newsBox in top of all the pages. (In Normal Pages) or sends all request for all the articles in API page or sends the request for the special page.
        try:
            newsBox = 'http://www.time.com' + response.xpath(
                "//div[@class='partial hero']/article/a/@href").extract_first(
                )
            if not self.postgres.checkUrlExists(newsBox):
                yield scrapy.Request(
                    url=newsBox,
                    callback=self.parse_article,
                    headers={
                        'User-Agent':
                        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'
                    },
                    errback=self.errorRequestHandler)
        except Exception as Error:
            # NOTE(review): if extract_first() returned None above, the
            # string concatenation raised TypeError before `newsBox` was
            # bound, so referencing it here raises NameError; and when it
            # IS bound it is a plain URL string with no .xpath() method.
            # Presumably `response.xpath(...)` was intended — confirm
            # before relying on this branch.
            if newsBox.xpath("//main[contains(@class,'content article')]"):
                yield scrapy.Request(
                    url=newsBox,
                    callback=self.parse_article,
                    headers={
                        'User-Agent':
                        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'
                    },
                    errback=self.errorRequestHandler)
            elif newsBox.xpath("//div[contains(@class,'_29M-6C9w')]"):
                newsContainer = newsBox.xpath(
                    "//div[contains(@class,'_29M-6C9w')]//div[contains(@class,'_2cCPyP5f')]//a[@class='_2S9ChopF']/@href"
                )
                # NOTE(review): iterating an xpath result yields selector
                # objects, not strings — `url=link` below presumably needs
                # link.extract(); verify against the Request API.
                for link in newsContainer:
                    if not self.postgres.checkUrlExists(link):
                        yield scrapy.Request(
                            url=link,
                            callback=self.parse_article,
                            headers={
                                'User-Agent':
                                'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'
                            },
                            errback=self.errorRequestHandler)
            else:
                loggerError.error(response.url)

        # For the rest of the boxes
        newsContainer = response.xpath(
            "//div[@class='partial marquee']/article")
        for newsBox in newsContainer:
            link = 'http://www.time.com' + newsBox.xpath(
                'a/@href').extract_first()
            if not self.postgres.checkUrlExists(link):
                yield scrapy.Request(
                    url=link,
                    callback=self.parse_article,
                    headers={
                        'User-Agent':
                        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'
                    },
                    errback=self.errorRequestHandler)
示例#11
0
 def getPageDate(self, response):
     """Return the Last-Modified meta content, or 'Error' on failure."""
     try:
         data = response.xpath("/html/head/meta[@name='Last-Modified']/@content").extract_first()
     except Exception as Error:
         # BUG FIX: the original called loggerError.error(Error, response.url),
         # passing the exception object as the logging format string; use the
         # file's standard message format instead.
         loggerError.error(str(Error) + ' occured at: ' + response.url)
         data = 'Error'
     finally:
         return data
示例#12
0
 def getPageContent(self, response):
     """Return every non-script text node under div.content joined by
     spaces; 'Error' when nothing matches."""
     fragments = response.xpath(
         "//div[@class='content']//*[not(self::script)]/text()").extract()
     data = ' '.join(fragments)
     if not data:
         loggerError.error(response.url)
         data = 'Error'
     return data
示例#13
0
 def getPageDate(self, response):
     """Return the article:published_time meta content, or 'Error' on failure."""
     try:
         data = (response.xpath("//head/meta[@property='article:published_time']/@content").extract_first())
     except Exception as Error:
         # BUG FIX: the original called loggerError.error(Error, response.url),
         # passing the exception object as the logging format string; use the
         # file's standard message format instead.
         loggerError.error(str(Error) + ' occured at: ' + response.url)
         data = 'Error'
     finally:
         return data
示例#14
0
File: newsx.py  Project: atb00ker/scrape
 def getPageTitle(self, response):
     """Return the headline with internal whitespace collapsed to single
     spaces; 'Error' on any failure."""
     try:
         headline = response.xpath(
             "//h1[@itemprop='headline']/text()").extract_first()
         # split()+join normalises runs of whitespace inside the headline;
         # AttributeError (headline is None) falls through to the handler.
         data = ' '.join(headline.split())
     except Exception as Error:
         loggerError.error(str(Error) + " occured at: " + response.url)
         data = 'Error'
     finally:
         return data
示例#15
0
 def getPageTitle(self, response):
     """Return the page heading, falling back to og:title; 'Error' when
     neither selector matches."""
     for xpath in ("//h1[contains(@class,'heading')]/text()",
                   "//meta[@property='og:title']/@content"):
         title = response.xpath(xpath).extract_first()
         if title is not None:
             return title
     loggerError.error(response.url)
     return 'Error'
示例#16
0
 def getPageDate(self, response):
     """Return article:published_time with its timezone offset stripped;
     'Error' on any failure."""
     try:
         published = response.xpath(
             "/html/head/meta[@property='article:published_time']/@content"
         ).extract_first()
         # rsplit drops '+HH:MM'; AttributeError (published is None) is
         # caught below.
         data = published.rsplit('+', 1)[0]
     except Exception as Error:
         loggerError.error(str(Error) + ' occured at: ' + response.url)
         data = 'Error'
     finally:
         return data
示例#17
0
 def getPageContent(self, response):
     """Return the article body, trying three selectors in order of
     preference; 'Error' when all of them come back empty."""
     for xpath in ("//div[@id='article-main']/p/text()",
                   "//div[@itemprop='articleBody']/p/text()",
                   "//meta[@property='og:description']/@content"):
         data = ' '.join(response.xpath(xpath).extract())
         if data:
             return data
     loggerError.error(response.url)
     return 'Error'
示例#18
0
File: newsx.py  Project: atb00ker/scrape
 def getPageImage(self, response):
     """Return the article image (link[rel=image_src] first, then the
     video thumbnail); 'Error' when neither exists."""
     for xpath in (
             "//head/link[@rel='image_src']/@href",
             "//div[@class='panel-body story']/div[@class='thumbnail video-thumbnail']/img/@src",
     ):
         image = response.xpath(xpath).extract_first()
         if image is not None:
             return image
     loggerError.error(response.url)
     return 'Error'
示例#19
0
 def getPageDate(self, response):
     """Return the second 'dattime' span with its last three space-separated
     tokens removed; 'Error' on any lookup failure."""
     try:
         spans = response.xpath("//span[@class='dattime']/text()").extract()
         # Index 1 holds the publish stamp; IndexError (too few spans) is
         # caught below.  rsplit trims the trailing tokens.
         data = spans[1].rsplit(' ', 3)[0]
     except Exception as Error:
         loggerError.error(str(Error) + ' occured at: ' + response.url)
         data = 'Error'
     finally:
         return data
示例#20
0
 def getPageContent(self, response):
     """Return the article body, falling back to og:description;
     'Error' when neither yields text."""
     data = ' '.join(
         response.xpath("//div[contains(@class,'io-article-body')]/p/text()"
                        ).extract())
     if not data:
         data = response.xpath(
             "//meta[@property='og:description']/@content").extract_first()
     if data is None:
         # BUG FIX: the original referenced an undefined `Error` name here,
         # raising NameError instead of logging the failing URL.
         loggerError.error(response.url)
         data = 'Error'
     return data
示例#21
0
File: newsx.py  Project: atb00ker/scrape
 def getPageDate(self, response):
     """Return the datePublished meta content without its timezone offset;
     'Error' on any failure."""
     try:
         stamp = response.xpath(
             "//head/meta[@itemprop='datePublished']/@content"
         ).extract_first()
         # AttributeError (stamp is None) is caught below.
         data = stamp.rsplit('+', 1)[0]
     except Exception as Error:
         loggerError.error(str(Error) + " occured at: " + response.url)
         data = 'Error'
     finally:
         return data
示例#22
0
 def getPageContent(self, response):
     """Return the article body via three fallbacks; 'Error' when all fail."""
     data = ' '.join(response.xpath(
         "//div[contains(@class,'io-article-body')]//text()").extract())
     if not data:
         data = ' '.join(response.xpath(
             "//div[contains(@id,'slider0')]/p/text()").extract())
     if not data:
         # Last resort: every non-script/style text node inside <article>,
         # with plain spaces and non-breaking spaces filtered out.
         pieces = response.xpath(
             "//article//*[not(self::script) and not(self::style)]/text()"
         ).extract()
         data = ' '.join(p for p in pieces if p != ' ' and p != u'\xa0')
     if not data:
         loggerError.error(response.url)
         data = 'Error'
     return data
示例#23
0
 def getPageImage(self, response):
     """Return the article image URL for hindi.oneindia.com pages.

     Tries the lazy-loaded listical image, then link[rel=image_src], then
     the plain listical src; logs and returns 'Error' when all fail.
     """
     base = 'https://hindi.oneindia.com'
     try:
         # extract_first() returns None when the node is missing, so the
         # concatenation raises TypeError — the only expected failure here
         # (the original over-broadly caught Exception).
         data = base + response.xpath(
             "//img[contains(@class,'image_listical')]/@data-pagespeed-lazy-src"
         ).extract_first()
     except TypeError:
         data = response.xpath(
             "//link[@rel='image_src']/@href").extract_first()
         if not data:
             try:
                 data = base + response.xpath(
                     "//img[contains(@class,'image_listical')]/@src"
                 ).extract_first()
             except TypeError as Error:
                 loggerError.error(str(Error) + ' occured at: ' + response.url)
                 data = 'Error'
     return data
示例#24
0
 def getPageContent(self, response):
     """Return the article body (body-article paragraphs, then
     full-details, then the synopsis heading); 'Error' when all empty."""
     for xpath in ("//div[@class='body-article']/p/text()",
                   "//div[@class='full-details']/p/text()"):
         data = ' '.join(response.xpath(xpath).extract())
         if data:
             return data
     data = ' '.join(
         response.xpath('//h2[@class="synopsis"]/text()').extract())
     # NOTE(review): this logs the URL even when the synopsis fallback
     # succeeds — presumably to flag degraded extraction; confirm intent.
     loggerError.error(response.url)
     if not data:
         loggerError.error(response.url)
         data = 'Error'
     return data
示例#25
0
 def getPageImage(self, response):
     """Return the article image: lazy-loaded caption image, then the
     itemprop meta URL, then the URL embedded in the lead-article inline
     style; 'Error' when the style parse fails."""
     data = response.xpath(
         '//span[@class="custom-caption"]/img/@data-lazy-src'
     ).extract_first()
     if data is None:
         data = response.xpath(
             "//span[@itemprop='image']/meta[@itemprop='url']/@content"
         ).extract_first()
     if data is None:
         try:
             # Pull the url(...) argument out of the inline style.
             style = response.xpath(
                 '//div[@class="lead-article"]/@style').extract_first()
             data = style.split('url(', 1)[1].split(')', 1)[0]
         except Exception as Error:
             loggerError.error(str(Error) + " occured at: " + response.url)
             data = 'Error'
     return data
示例#26
0
File: newsx.py  Project: atb00ker/scrape
 def getPageContent(self, response):
     """Return article text via the short-title heading, then articleBody
     paragraphs, then articleBody divs; 'Error' when all are empty."""
     candidates = (
         "//div[@class='story-short-title']/h2/text()",
         "//div[@itemprop='articleBody']/p//text()",
         "//div[@itemprop='articleBody']/div//text()",
     )
     for xpath in candidates:
         data = ' '.join(response.xpath(xpath).extract())
         if data:
             return data
     loggerError.error(response.url)
     return 'Error'
示例#27
0
 def getPageContent(self, response):
     """Return the article body, trying a cascade of layout-specific
     selectors in order; logs and returns 'Error' when all come back empty.

     Replaces ten copy-pasted fallback stanzas with a single ordered loop;
     the selector order and behaviour are unchanged.
     """
     # Ordered from the most common layout to rare legacy layouts.
     selectors = (
         "//div[contains(@class,'csmpn')]/p//text()",
         "//div[contains(@class,'aXjCH')]/div/p//text()",
         "//div[contains(@class,'csmpn')]/div[not(@class) or @class='alltake_head']//text()",
         "//div[@itemprop='articleBody']/p/text()",
         "//div[@class='fulstorysharecomment']/text()",
         "//div[contains(@class,'csmpn')]/ul/li/text()",
         "//div[@class='fullstorydivstorycomment' or @class='fullstorydivstory' or @class='fulstorytext']//text()",
         "//div[@class='_1mf _1mj' or @class='ttl']//text()",
         "//p[@class='news-intro']//text()",
         "//div[contains(@class,'csmpn')]/div[not(@class)]/text()",
     )
     for xpath in selectors:
         data = ' '.join(response.xpath(xpath).extract())
         if data:
             return data
     loggerError.error(response.url)
     return 'Error'
示例#28
0
 def getPageTitle(self, response):
     """Return the headline, trying several layout variants in order;
     'Error' when none of them match."""
     for xpath in (
             '//h1[@itemprop="headline"]/text()',
             "//h1[contains(@class,'headline')]/text()",
             "//h1[@class='_8UFs4BVE']/text()",
             "//span[@class='xxx_oneoff_special_story_v3_headline']/text()",
     ):
         title = response.xpath(xpath).extract_first()
         if title is not None:
             return title
     loggerError.error(response.url)
     return 'Error'
示例#29
0
 def getPageTitle(self, response):
     """Return the headline with whitespace normalised, falling back to
     alternate layouts when the itemprop headline is absent; 'Error' on
     total failure."""
     try:
         data = ' '.join(
             response.xpath("//h1[@itemprop='headline']/text()").
             extract_first().split())
     except AttributeError as Error:
         # extract_first() returned None — try alternate page layouts.
         data = response.xpath(
             '//h1[@class="story-title"]/text()').extract_first()
         if data is None:
             data = response.xpath(
                 '//h1[@class="page-title article-title"]/text()'
             ).extract_first()
         if data is None:
             # BUG FIX: the original called loggerError.error(Error, url),
             # passing the exception as the logging format string; use the
             # file's standard message format instead.
             loggerError.error(str(Error) + ' occured at: ' + response.url)
             data = 'Error'
     except Exception as Error:
         loggerError.error(str(Error) + ' occured at: ' + response.url)
         data = 'Error'
     finally:
         return data
示例#30
0
 def parse_title(self, response):
     """Return the article title for the NDTV property serving this page,
     or None (after logging) for an unrecognised host."""
     url = response.url
     if 'gadgets.ndtv.com' in url:
         return response.xpath('//div[@class="lead_heading"]/h1/span/text()'
                               ).extract_first().strip()
     elif ('www.ndtv.com' in url or 'food.ndtv.com' in url
           or 'sports.ndtv.com' in url or 'profit.ndtv.com' in url):
         # These four properties share the itemprop headline markup
         # (the original had two identical elif branches).
         return response.xpath(
             '//h1[@itemprop="headline"]/text()').extract_first().strip()
     elif 'auto.ndtv.com' in url:
         return response.xpath(
             '//h1[@class="article__headline"]/text()').extract_first()
     elif 'doctor.ndtv.com' in url:
         # BUG FIX: article_heading was unquoted, so XPath treated it as a
         # child *element* test whose string-value is usually '' — and
         # contains(x, '') is always true, matching every div.  Quote it so
         # the class name is compared as a string.
         return response.xpath(
             '//div[contains(@class, "article_heading")]/div[@class="__sslide"]/h1/text()'
         ).extract_first().strip()
     else:
         loggerError.error('Could not handle Parsing TITLE at ' +
                           response.url)
         return None