Пример #1
0
 def getPageDate(self, response):
     """Return the article's publish date string, or 'Error' on failure.

     Primary source: the "datePublished" field inside the ld+json
     script (first 19 characters, i.e. YYYY-MM-DDTHH:MM:SS).
     Fallback: an inline "<script>utag_data" payload's "publish_date"
     field with the trailing timezone offset stripped.
     """
     data = None
     try:
         # split used to spit data in correct format: keep the 19
         # characters that follow the datePublished marker.
         data = (str(
             response.xpath("//script[@type='application/ld+json']").
             extract_first()).split('datePublished":"', 1)[1])[:19]
     except (TypeError, IndexError):
         # This fail case works only on very specific articles.
         scriptData = None
         scriptsList = response.xpath(
             "/html/head/script[not(contains(@type,'text/javascript'))]")
         for script in scriptsList:
             try:
                 scriptData = script.extract().split(
                     "<script>utag_data", 1)[1]
                 break
             except IndexError:
                 # BUG FIX: was a bare `except:`; only the failed split
                 # (marker absent in this tag) should be skipped.
                 continue
         if scriptData is not None:
             try:
                 data = scriptData.split('"publish_date":"',
                                         1)[1].split("+", 1)[0]
             except IndexError:
                 data = None
         if data is None:
             # BUG FIX: `data` used to be unbound here (NameError) when
             # no script matched; it is now pre-initialised to None.
             loggerError.error(response.url)
             data = 'Error'
     except Exception as Error:
         loggerError.error(str(Error) + ' occured at: ' + response.url)
         data = 'Error'
     # BUG FIX: dropped `finally: return data`, which silently swallowed
     # any exception raised inside the handlers above.
     return data
Пример #2
0
 def getPageDate(self, response):
     """Return the datePublished meta content, or 'Error' if absent."""
     data = response.xpath(
         '//meta[@itemprop="datePublished"]/@content').extract_first()
     if data is None:
         # BUG FIX: previously logged the undefined name `Error`, which
         # raised NameError whenever the meta tag was missing.
         loggerError.error(response.url)
         data = 'Error'
     return data
Пример #3
0
 def getPageImage(self, response):
     """Return the article's figure image URL, or 'Error' (logged)."""
     image = response.xpath(
         '//div[@class="content"]/div/figure/img/@src').extract_first()
     if image is not None:
         return image
     loggerError.error(response.url)
     return 'Error'
Пример #4
0
 def getPageTitle(self, response):
     """Return the headline text, or 'Error' (logged) when missing."""
     title = response.xpath(
         '//h1[@itemprop="headline"]/text()').extract_first()
     if title is not None:
         return title
     loggerError.error(response.url)
     return 'Error'
Пример #5
0
    def parse_content(self, response):
        """Extract and concatenate the article body text for the NDTV
        site family; returns None (after logging) for unknown hosts.

        Every branch strips each text fragment and joins them with no
        separator, as the per-site markup requires.
        """
        url = response.url
        if 'gadgets.ndtv.com' in url:
            fragments = response.css('div.content_text>p::text').extract()
        elif 'www.ndtv.com' in url:
            fragments = response.xpath(
                '//div[@itemprop="articleBody"]/text()').extract()
        elif 'auto.ndtv.com' in url or 'sports.ndtv.com' in url:
            fragments = response.xpath(
                '//div[@itemprop="articleBody"]/p/text()').extract()
        elif 'food.ndtv.com' in url or 'profit.ndtv.com' in url:
            fragments = response.xpath(
                '//span[@itemprop="articleBody"]/text()').extract()
        elif 'doctor.ndtv.com' in url:
            fragments = response.xpath(
                '//div[@class="article_storybody"]/p/text()').extract()
        else:
            loggerError.error('Could not handle parsing CONTENT at ' + url)
            return None
        # Strip each fragment, concatenate without a separator, then
        # strip the whole result — same as the original per-branch loops.
        return ''.join(piece.strip() for piece in fragments).strip()
Пример #6
0
 def getPageImage(self, response):
     """Return the og:image meta content, or 'Error' (logged)."""
     image = response.xpath(
         "//meta[@property='og:image']/@content").extract_first()
     if image is None:
         loggerError.error(response.url)
         return 'Error'
     return image
Пример #7
0
 def parse_image(self, response):
     """Return the story image URL for the matching NDTV property, or
     None (after logging) when the host is not recognised."""
     # Host substring -> image XPath, checked in the original order
     # (gadgets before www, since 'www.ndtv.com' would also match).
     queries = (
         ('gadgets.ndtv.com',
          '//div[@class="fullstoryImage"]/picture/source/@srcset'),
         ('www.ndtv.com',
          '//div[contains(@class, "ins_mainimage_big")]/img/@src'),
         ('auto.ndtv.com', '//img[@itemprop="url"]/@src'),
         ('food.ndtv.com',
          '//div[@itemprop="image"]/meta[@itemprop="url"]/@content'),
         ('sports.ndtv.com',
          '//div[@itemprop="image"]/img[@class="caption"]/@src'),
         ('doctor.ndtv.com',
          '//div[@class="article-stry-image"]/img/@src'),
         # BUG FIX: this query previously began with '///', which is not
         # a valid XPath expression and would raise at evaluation time.
         ('profit.ndtv.com', '//div[@id="story_pic"]/div/img/@src'),
     )
     for host, query in queries:
         if host in response.url:
             return response.xpath(query).extract_first()
     loggerError.error('Check for parsing IMAGE at ' + response.url)
     return None
Пример #8
0
 def getPageContent(self, response):
     """Return the story body text: direct <p> text first, then any
     nested text; 'Error' (logged) when both come up empty."""
     for query in ("//div[@id='storyBody']/p/text()",
                   "//div[@id='storyBody']/p//text()"):
         data = ' '.join(response.xpath(query).extract())
         if data:
             return data
     loggerError.error(response.url)
     return 'Error'
Пример #9
0
 def getPageDate(self, response):
     """Return the <time datetime> value with the trailing '+TZ' offset
     removed, or 'Error' (logged) when the element is missing."""
     try:
         raw = response.xpath("//time/@datetime").extract_first()
         # rsplit keeps everything before the LAST '+' (the TZ offset).
         data = raw.rsplit('+', 1)[0]
     except Exception as Error:
         loggerError.error(str(Error) + ' occured at: ' + response.url)
         data = 'Error'
     return data
Пример #10
0
    def parse(self, response):
        """Schedule article requests from a time.com listing page.

        Handles (a) the large hero newsBox at the top of normal pages,
        (b) fallback layouts — the page itself being an article, or the
        '_29M-6C9w' grid of links — and (c) the remaining marquee boxes.
        """
        # One shared UA header for every request scheduled below.
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'
        }

        # For the large newsBox in top of all the pages (normal pages);
        # on failure, fall back to the special-layout handling below.
        try:
            newsBox = 'http://www.time.com' + response.xpath(
                "//div[@class='partial hero']/article/a/@href").extract_first(
                )
            if not self.postgres.checkUrlExists(newsBox):
                yield scrapy.Request(
                    url=newsBox,
                    callback=self.parse_article,
                    headers=headers,
                    errback=self.errorRequestHandler)
        except Exception as Error:
            # BUG FIX: the handler used to call .xpath on `newsBox`, a
            # plain string that may not even be bound when
            # extract_first() returned None; the selectors belong to
            # `response`, and the article-page case should re-request
            # the current URL.
            if response.xpath("//main[contains(@class,'content article')]"):
                yield scrapy.Request(
                    url=response.url,
                    callback=self.parse_article,
                    headers=headers,
                    errback=self.errorRequestHandler)
            elif response.xpath("//div[contains(@class,'_29M-6C9w')]"):
                # BUG FIX: iterate extracted href strings, not Selector
                # objects — scrapy.Request requires a string URL.
                newsContainer = response.xpath(
                    "//div[contains(@class,'_29M-6C9w')]//div[contains(@class,'_2cCPyP5f')]//a[@class='_2S9ChopF']/@href"
                ).extract()
                for link in newsContainer:
                    if not self.postgres.checkUrlExists(link):
                        yield scrapy.Request(
                            url=link,
                            callback=self.parse_article,
                            headers=headers,
                            errback=self.errorRequestHandler)
            else:
                loggerError.error(response.url)

        # For the rest of the boxes.
        newsContainer = response.xpath(
            "//div[@class='partial marquee']/article")
        for newsBox in newsContainer:
            link = 'http://www.time.com' + newsBox.xpath(
                'a/@href').extract_first()
            if not self.postgres.checkUrlExists(link):
                yield scrapy.Request(
                    url=link,
                    callback=self.parse_article,
                    headers=headers,
                    errback=self.errorRequestHandler)
Пример #11
0
 def getPageDate(self, response):
     """Return the Last-Modified meta content, or 'Error' on failure."""
     try:
         data = response.xpath(
             "/html/head/meta[@name='Last-Modified']/@content"
         ).extract_first()
     except Exception as Error:
         # BUG FIX: the logger was called as error(Error, response.url),
         # which treats the exception as a %-format string and drops the
         # URL; build the message explicitly (file's standard format).
         loggerError.error(str(Error) + ' occured at: ' + response.url)
         data = 'Error'
     return data
Пример #12
0
 def getPageContent(self, response):
     """Return all non-script text inside the content div joined with
     spaces, or 'Error' (logged) when nothing is found."""
     fragments = response.xpath(
         "//div[@class='content']//*[not(self::script)]/text()").extract()
     data = ' '.join(fragments)
     if not data:
         loggerError.error(response.url)
         data = 'Error'
     return data
Пример #13
0
 def getPageDate(self, response):
     """Return the article:published_time meta content, or 'Error'."""
     try:
         data = response.xpath(
             "//head/meta[@property='article:published_time']/@content"
         ).extract_first()
     except Exception as Error:
         # BUG FIX: the logger was called with the exception object as a
         # %-format string, dropping the URL; format explicitly instead.
         loggerError.error(str(Error) + ' occured at: ' + response.url)
         data = 'Error'
     return data
Пример #14
0
 def getPageTitle(self, response):
     """Return the headline with runs of whitespace collapsed to single
     spaces, or 'Error' (logged) when the headline is missing."""
     try:
         raw = response.xpath(
             "//h1[@itemprop='headline']/text()").extract_first()
         # .split() with no args collapses any whitespace runs.
         data = ' '.join(raw.split())
     except Exception as Error:
         loggerError.error(str(Error) + " occured at: " + response.url)
         data = 'Error'
     return data
Пример #15
0
 def getPageTitle(self, response):
     """Return the page title: the visible heading first, then the
     og:title meta tag; 'Error' (logged) when both are missing."""
     for query in ("//h1[contains(@class,'heading')]/text()",
                   "//meta[@property='og:title']/@content"):
         data = response.xpath(query).extract_first()
         if data is not None:
             return data
     loggerError.error(response.url)
     return 'Error'
Пример #16
0
 def getPageDate(self, response):
     """Return article:published_time with the trailing '+TZ' offset
     stripped, or 'Error' (logged) when missing."""
     try:
         stamp = response.xpath(
             "/html/head/meta[@property='article:published_time']/@content"
         ).extract_first()
         # rsplit keeps everything before the last '+'.
         data = stamp.rsplit('+', 1)[0]
     except Exception as Error:
         loggerError.error(str(Error) + ' occured at: ' + response.url)
         data = 'Error'
     return data
Пример #17
0
 def getPageContent(self, response):
     """Return the article text: main article div, then itemprop body,
     then og:description; 'Error' (logged) when all are empty."""
     queries = ("//div[@id='article-main']/p/text()",
                "//div[@itemprop='articleBody']/p/text()",
                "//meta[@property='og:description']/@content")
     for query in queries:
         data = ' '.join(response.xpath(query).extract())
         if data:
             return data
     loggerError.error(response.url)
     return 'Error'
Пример #18
0
 def getPageImage(self, response):
     """Return the page image URL: the image_src head link first, then
     the story video thumbnail; 'Error' (logged) when neither exists."""
     queries = (
         "//head/link[@rel='image_src']/@href",
         "//div[@class='panel-body story']/div[@class='thumbnail video-thumbnail']/img/@src",
     )
     for query in queries:
         data = response.xpath(query).extract_first()
         if data is not None:
             return data
     loggerError.error(response.url)
     return 'Error'
Пример #19
0
 def getPageDate(self, response):
     """Return the second 'dattime' span with its last three
     space-separated tokens dropped, or 'Error' (logged) on failure."""
     try:
         stamps = response.xpath(
             "//span[@class='dattime']/text()").extract()
         # rsplit drops the trailing three tokens from the timestamp.
         data = stamps[1].rsplit(' ', 3)[0]
     except Exception as Error:
         loggerError.error(str(Error) + ' occured at: ' + response.url)
         data = 'Error'
     return data
Пример #20
0
 def getPageContent(self, response):
     """Return the article body text, falling back to og:description;
     'Error' (logged) when neither yields anything."""
     data = ' '.join(
         response.xpath("//div[contains(@class,'io-article-body')]/p/text()"
                        ).extract())
     if not data:
         data = response.xpath(
             "//meta[@property='og:description']/@content").extract_first()
     if data is None:
         # BUG FIX: previously logged the undefined name `Error`, which
         # raised NameError on the empty-page path; log the URL only.
         loggerError.error(response.url)
         data = 'Error'
     return data
Пример #21
0
 def getPageDate(self, response):
     """Return the datePublished meta content with the trailing '+TZ'
     offset stripped, or 'Error' (logged) when missing."""
     try:
         stamp = response.xpath(
             "//head/meta[@itemprop='datePublished']/@content"
         ).extract_first()
         data = stamp.rsplit('+', 1)[0]
     except Exception as Error:
         loggerError.error(str(Error) + " occured at: " + response.url)
         data = 'Error'
     return data
Пример #22
0
 def getPageContent(self, response):
     """Return the article text via three progressively looser
     selectors; 'Error' (logged) when all of them come up empty."""
     data = ' '.join(response.xpath(
         "//div[contains(@class,'io-article-body')]//text()").extract())
     if not data:
         data = ' '.join(response.xpath(
             "//div[contains(@id,'slider0')]/p/text()").extract())
     if not data:
         fragments = response.xpath(
             "//article//*[not(self::script) and not(self::style)]/text()"
         ).extract()
         # Drop bare-space and non-breaking-space fragments, then join.
         data = ' '.join(x for x in fragments if x != ' ' and x != u'\xa0')
     if not data:
         loggerError.error(response.url)
         data = 'Error'
     return data
Пример #23
0
 def getPageImage(self, response):
     """Return an absolute image URL for the article.

     Prefers the lazy-loaded listical image, then the image_src link,
     then the non-lazy listical src; 'Error' (logged) if all fail.
     """
     # BUG FIX/idiom: the original drove control flow via the TypeError
     # raised by `str + None`; test extract_first() for None explicitly.
     path = response.xpath(
         "//img[contains(@class,'image_listical')]/@data-pagespeed-lazy-src"
     ).extract_first()
     if path is not None:
         return 'https://hindi.oneindia.com' + path
     data = response.xpath("//link[@rel='image_src']/@href").extract_first()
     if data:
         return data
     path = response.xpath(
         "//img[contains(@class,'image_listical')]/@src").extract_first()
     if path is not None:
         return 'https://hindi.oneindia.com' + path
     loggerError.error('no image found at: ' + response.url)
     return 'Error'
Пример #24
0
 def getPageContent(self, response):
     """Return the article text: body-article paragraphs, then
     full-details paragraphs, then the synopsis (always logged as a
     degraded case); 'Error' (logged again) when everything is empty."""
     for query in ("//div[@class='body-article']/p/text()",
                   "//div[@class='full-details']/p/text()"):
         data = ' '.join(response.xpath(query).extract())
         if data:
             return data
     data = ' '.join(
         response.xpath('//h2[@class="synopsis"]/text()').extract())
     # Original behaviour preserved: the synopsis fallback is logged
     # even when it succeeds.
     loggerError.error(response.url)
     if not data:
         loggerError.error(response.url)
         data = 'Error'
     return data
Пример #25
0
 def getPageImage(self, response):
     """Return the story image URL: lazy caption image, then itemprop
     meta, then the lead-article background style; 'Error' on failure."""
     data = response.xpath(
         '//span[@class="custom-caption"]/img/@data-lazy-src'
     ).extract_first()
     if data is None:
         data = response.xpath(
             "//span[@itemprop='image']/meta[@itemprop='url']/@content"
         ).extract_first()
     if data is None:
         try:
             # Pull the url(...) argument out of the inline style attr.
             style = response.xpath(
                 '//div[@class="lead-article"]/@style').extract_first()
             data = style.split('url(', 1)[1].split(')', 1)[0]
         except Exception as Error:
             loggerError.error(str(Error) + " occured at: " + response.url)
             data = 'Error'
     return data
Пример #26
0
 def getPageContent(self, response):
     """Return story text, preferring the short title, then paragraph
     and div text under the article body; 'Error' when all are empty."""
     queries = ("//div[@class='story-short-title']/h2/text()",
                "//div[@itemprop='articleBody']/p//text()",
                "//div[@itemprop='articleBody']/div//text()")
     for query in queries:
         data = ' '.join(response.xpath(query).extract())
         if data:
             return data
     loggerError.error(response.url)
     return 'Error'
Пример #27
0
 def getPageContent(self, response):
     """Return the article body text for this site's many layout
     variants, trying each known selector in order; 'Error' (logged)
     when every selector yields an empty string.

     The selector order is exactly the original fallback chain.
     """
     queries = (
         "//div[contains(@class,'csmpn')]/p//text()",
         "//div[contains(@class,'aXjCH')]/div/p//text()",
         "//div[contains(@class,'csmpn')]/div[not(@class) or @class='alltake_head']//text()",
         "//div[@itemprop='articleBody']/p/text()",
         "//div[@class='fulstorysharecomment']/text()",
         "//div[contains(@class,'csmpn')]/ul/li/text()",
         "//div[@class='fullstorydivstorycomment' or @class='fullstorydivstory' or @class='fulstorytext']//text()",
         "//div[@class='_1mf _1mj' or @class='ttl']//text()",
         "//p[@class='news-intro']//text()",
         "//div[contains(@class,'csmpn')]/div[not(@class)]/text()",
     )
     for query in queries:
         data = ' '.join(response.xpath(query).extract())
         if data:
             return data
     loggerError.error(response.url)
     return 'Error'
Пример #28
0
 def getPageTitle(self, response):
     """Return the headline, trying each known layout variant in order;
     'Error' (logged) when none of them match."""
     queries = (
         '//h1[@itemprop="headline"]/text()',
         "//h1[contains(@class,'headline')]/text()",
         "//h1[@class='_8UFs4BVE']/text()",
         "//span[@class='xxx_oneoff_special_story_v3_headline']/text()",
     )
     for query in queries:
         data = response.xpath(query).extract_first()
         if data is not None:
             return data
     loggerError.error(response.url)
     return 'Error'
Пример #29
0
 def getPageTitle(self, response):
     """Return the headline with whitespace collapsed; when the main
     headline is missing, fall back to the story/page title variants;
     'Error' (logged) on total failure."""
     try:
         data = ' '.join(
             response.xpath("//h1[@itemprop='headline']/text()").
             extract_first().split())
     except AttributeError as Error:
         # extract_first() returned None — try the alternate layouts.
         data = response.xpath(
             '//h1[@class="story-title"]/text()').extract_first()
         if data is None:
             data = response.xpath(
                 '//h1[@class="page-title article-title"]/text()'
             ).extract_first()
         if data is None:
             # BUG FIX: logger was called as error(Error, response.url),
             # treating the exception as a %-format string; build the
             # message explicitly (file's standard format).
             loggerError.error(str(Error) + ' occured at: ' + response.url)
             data = 'Error'
     except Exception as Error:
         loggerError.error(str(Error) + ' occured at: ' + response.url)
         data = 'Error'
     return data
Пример #30
0
 def parse_title(self, response):
     """Return the article title for the NDTV site family, or None
     (after logging) for unrecognised hosts."""
     url = response.url
     if 'gadgets.ndtv.com' in url:
         return response.xpath('//div[@class="lead_heading"]/h1/span/text()'
                               ).extract_first().strip()
     elif ('www.ndtv.com' in url or 'food.ndtv.com' in url
           or 'sports.ndtv.com' in url or 'profit.ndtv.com' in url):
         # The two original branches used the same selector; merged.
         return response.xpath(
             '//h1[@itemprop="headline"]/text()').extract_first().strip()
     elif 'auto.ndtv.com' in url:
         return response.xpath(
             '//h1[@class="article__headline"]/text()').extract_first()
     elif 'doctor.ndtv.com' in url:
         # BUG FIX: article_heading was unquoted, which XPath parses as
         # a child-element node test rather than the class-name string.
         return response.xpath(
             '//div[contains(@class, "article_heading")]/div[@class="__sslide"]/h1/text()'
         ).extract_first().strip()
     else:
         loggerError.error('Could not handle Parsing TITLE at ' + url)
         return None