示例#1
0
 def getPageDate(self, response):
     """Return the article's publish timestamp as a string.

     Tries the JSON-LD script block first, then falls back to the
     utag_data script used by some article layouts.  Logs the URL and
     returns 'Error' when no date can be extracted.
     """
     data = None
     try:
         # The JSON-LD payload carries an ISO timestamp right after
         # 'datePublished":"'; the [:19] slice keeps YYYY-MM-DDTHH:MM:SS.
         data = (str(
             response.xpath("//script[@type='application/ld+json']").
             extract_first()).split('datePublished":"', 1)[1])[:19]
     except (TypeError, IndexError):
         # This fail case works only on very specific articles.
         scriptData = None
         scriptsList = response.xpath(
             "/html/head/script[not(contains(@type,'text/javascript'))]")
         for script in scriptsList:
             try:
                 scriptData = (script.extract()).split(
                     "<script>utag_data", 1)[1]
                 break
             except IndexError:
                 # BUG FIX: was a bare `except:`; only a missing marker
                 # (IndexError from split) should be skipped here.
                 continue
         if scriptData is not None:
             try:
                 data = (scriptData.split('"publish_date":"',
                                          1)[1]).split("+", 1)[0]
             except IndexError:
                 data = None
         if data is None:
             # BUG FIX: `data` was previously unbound on this path, so the
             # original raised NameError instead of returning 'Error'.
             loggerError.error(response.url)
             data = 'Error'
     except Exception as Error:
         loggerError.error(str(Error) + ' occured at: ' + response.url)
         data = 'Error'
     return data
示例#2
0
 def getPageDate(self, response):
     """Return the datePublished meta content, or 'Error' when absent."""
     data = response.xpath(
         '//meta[@itemprop="datePublished"]/@content').extract_first()
     if data is None:
         # BUG FIX: the original referenced an undefined `Error` name here,
         # raising NameError instead of logging the failing URL.
         loggerError.error(response.url)
         data = 'Error'
     return data
示例#3
0
 def getPageImage(self, response):
     """Return the article's figure image URL, or 'Error' when missing."""
     image_url = response.xpath(
         '//div[@class="content"]/div/figure/img/@src').extract_first()
     if image_url is not None:
         return image_url
     # No image node found; record the offending URL.
     loggerError.error(response.url)
     return 'Error'
示例#4
0
 def getPageTitle(self, response):
     """Return the article headline text, or 'Error' when missing."""
     headline = response.xpath(
         '//h1[@itemprop="headline"]/text()').extract_first()
     if headline is not None:
         return headline
     # No headline node found; record the offending URL.
     loggerError.error(response.url)
     return 'Error'
示例#5
0
    def parse_content(self, response):
        """Return the article body text for the NDTV property serving this
        page, or None (after logging) for an unrecognised host.

        Each NDTV sub-site uses a different markup for the body; the chain
        below picks the matching selector, then the stripped fragments are
        concatenated without separators (original behaviour).
        """
        url = response.url
        if 'gadgets.ndtv.com' in url:
            fragments = response.css('div.content_text>p::text').extract()
        elif 'www.ndtv.com' in url:
            fragments = response.xpath(
                '//div[@itemprop="articleBody"]/text()').extract()
        elif 'auto.ndtv.com' in url or 'sports.ndtv.com' in url:
            fragments = response.xpath(
                '//div[@itemprop="articleBody"]/p/text()').extract()
        elif 'food.ndtv.com' in url or 'profit.ndtv.com' in url:
            fragments = response.xpath(
                '//span[@itemprop="articleBody"]/text()').extract()
        elif 'doctor.ndtv.com' in url:
            fragments = response.xpath(
                '//div[@class="article_storybody"]/p/text()').extract()
        else:
            loggerError.error('Could not handle parsing CONTENT at ' +
                              response.url)
            return None

        # Single join replaces the five duplicated quadratic `+=` loops.
        return ''.join(c.strip() for c in fragments).strip()
示例#6
0
 def getPageImage(self, response):
     """Return the Open Graph image URL, or 'Error' when absent."""
     og_image = response.xpath(
         "//meta[@property='og:image']/@content").extract_first()
     if og_image is not None:
         return og_image
     loggerError.error(response.url)
     return 'Error'
示例#7
0
 def parse_image(self, response):
     """Return the lead-image URL for the NDTV property serving this page,
     or None (after logging) for an unrecognised host."""
     url = response.url
     if 'gadgets.ndtv.com' in url:
         return response.xpath(
             '//div[@class="fullstoryImage"]/picture/source/@srcset'
         ).extract_first()
     elif 'www.ndtv.com' in url:
         return response.xpath(
             '//div[contains(@class, "ins_mainimage_big")]/img/@src'
         ).extract_first()
     elif 'auto.ndtv.com' in url:
         return response.xpath(
             '//img[@itemprop="url"]/@src').extract_first()
     elif 'food.ndtv.com' in url:
         return response.xpath(
             '//div[@itemprop="image"]/meta[@itemprop="url"]/@content'
         ).extract_first()
     elif 'sports.ndtv.com' in url:
         return response.xpath(
             '//div[@itemprop="image"]/img[@class="caption"]/@src'
         ).extract_first()
     elif 'doctor.ndtv.com' in url:
         return response.xpath(
             '//div[@class="article-stry-image"]/img/@src').extract_first()
     elif 'profit.ndtv.com' in url:
         # BUG FIX: the expression began with '///', which is not valid
         # XPath (an abbreviated '//' must be followed by a step) and makes
         # the selector raise instead of matching; '//' is the intent.
         return response.xpath(
             '//div[@id="story_pic"]/div/img/@src').extract_first()
     else:
         loggerError.error('Check for parsing IMAGE at ' + response.url)
         return None
示例#8
0
 def getPageContent(self, response):
     """Return the story body joined by spaces, widening the text() match
     if the direct-child match is empty; 'Error' when both are empty."""
     for xpath in ("//div[@id='storyBody']/p/text()",
                   "//div[@id='storyBody']/p//text()"):
         data = ' '.join(response.xpath(xpath).extract())
         if data:
             return data
     loggerError.error(response.url)
     return 'Error'
示例#9
0
 def getPageDate(self, response):
     """Return the <time datetime> value with any '+HH:MM' timezone offset
     removed; logs and returns 'Error' when the node is missing."""
     try:
         stamp = response.xpath("//time/@datetime").extract_first()
         # rsplit drops the offset; AttributeError when stamp is None is
         # caught below.
         data = stamp.rsplit('+', 1)[0]
     except Exception as Error:
         loggerError.error(str(Error) + ' occured at: ' + response.url)
         data = 'Error'
     finally:
         return data
示例#10
0
    def parse(self, response):
        """Queue article requests from a time.com listing page.

        Handles the large hero box at the top of normal pages, then the
        remaining 'marquee' boxes; URLs already recorded via
        self.postgres.checkUrlExists() are skipped.
        """

        # For the large newsBox in top of all the pages. (In Normal Pages) or sends all request for all the articles in API page or sends the request for the special page.
        try:
            newsBox = 'http://www.time.com' + response.xpath(
                "//div[@class='partial hero']/article/a/@href").extract_first(
                )
            if not self.postgres.checkUrlExists(newsBox):
                yield scrapy.Request(
                    url=newsBox,
                    callback=self.parse_article,
                    headers={
                        'User-Agent':
                        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'
                    },
                    errback=self.errorRequestHandler)
        except Exception as Error:
            # NOTE(review): if extract_first() returned None above, the
            # string concatenation raised TypeError before `newsBox` was
            # bound, so referencing it here raises NameError; and when it
            # IS bound it is a plain URL string with no .xpath() method.
            # Presumably `response.xpath(...)` was intended — confirm
            # before relying on this branch.
            if newsBox.xpath("//main[contains(@class,'content article')]"):
                yield scrapy.Request(
                    url=newsBox,
                    callback=self.parse_article,
                    headers={
                        'User-Agent':
                        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'
                    },
                    errback=self.errorRequestHandler)
            elif newsBox.xpath("//div[contains(@class,'_29M-6C9w')]"):
                newsContainer = newsBox.xpath(
                    "//div[contains(@class,'_29M-6C9w')]//div[contains(@class,'_2cCPyP5f')]//a[@class='_2S9ChopF']/@href"
                )
                # NOTE(review): iterating an xpath result yields selector
                # objects, not strings — `url=link` below presumably needs
                # link.extract(); verify against the Request API.
                for link in newsContainer:
                    if not self.postgres.checkUrlExists(link):
                        yield scrapy.Request(
                            url=link,
                            callback=self.parse_article,
                            headers={
                                'User-Agent':
                                'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'
                            },
                            errback=self.errorRequestHandler)
            else:
                loggerError.error(response.url)

        # For the rest of the boxes
        newsContainer = response.xpath(
            "//div[@class='partial marquee']/article")
        for newsBox in newsContainer:
            link = 'http://www.time.com' + newsBox.xpath(
                'a/@href').extract_first()
            if not self.postgres.checkUrlExists(link):
                yield scrapy.Request(
                    url=link,
                    callback=self.parse_article,
                    headers={
                        'User-Agent':
                        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'
                    },
                    errback=self.errorRequestHandler)
示例#11
0
 def getPageDate(self, response):
     """Return the Last-Modified meta content, or 'Error' on failure."""
     try:
         data = response.xpath("/html/head/meta[@name='Last-Modified']/@content").extract_first()
     except Exception as Error:
         # BUG FIX: the original called loggerError.error(Error, response.url),
         # passing the exception object as the logging format string; use the
         # file's standard message format instead.
         loggerError.error(str(Error) + ' occured at: ' + response.url)
         data = 'Error'
     finally:
         return data
示例#12
0
 def getPageContent(self, response):
     """Return every non-script text node under div.content joined by
     spaces; 'Error' when nothing matches."""
     fragments = response.xpath(
         "//div[@class='content']//*[not(self::script)]/text()").extract()
     data = ' '.join(fragments)
     if not data:
         loggerError.error(response.url)
         data = 'Error'
     return data
示例#13
0
 def getPageDate(self, response):
     """Return the article:published_time meta content, or 'Error' on failure."""
     try:
         data = (response.xpath("//head/meta[@property='article:published_time']/@content").extract_first())
     except Exception as Error:
         # BUG FIX: the original called loggerError.error(Error, response.url),
         # passing the exception object as the logging format string; use the
         # file's standard message format instead.
         loggerError.error(str(Error) + ' occured at: ' + response.url)
         data = 'Error'
     finally:
         return data
示例#14
0
File: newsx.py  Project: atb00ker/scrape
 def getPageTitle(self, response):
     """Return the headline with internal whitespace collapsed to single
     spaces; 'Error' on any failure."""
     try:
         headline = response.xpath(
             "//h1[@itemprop='headline']/text()").extract_first()
         # split()+join normalises runs of whitespace inside the headline;
         # AttributeError (headline is None) falls through to the handler.
         data = ' '.join(headline.split())
     except Exception as Error:
         loggerError.error(str(Error) + " occured at: " + response.url)
         data = 'Error'
     finally:
         return data
示例#15
0
 def getPageTitle(self, response):
     """Return the page heading, falling back to og:title; 'Error' when
     neither selector matches."""
     for xpath in ("//h1[contains(@class,'heading')]/text()",
                   "//meta[@property='og:title']/@content"):
         title = response.xpath(xpath).extract_first()
         if title is not None:
             return title
     loggerError.error(response.url)
     return 'Error'
示例#16
0
 def getPageDate(self, response):
     """Return article:published_time with its timezone offset stripped;
     'Error' on any failure."""
     try:
         published = response.xpath(
             "/html/head/meta[@property='article:published_time']/@content"
         ).extract_first()
         # rsplit drops '+HH:MM'; AttributeError (published is None) is
         # caught below.
         data = published.rsplit('+', 1)[0]
     except Exception as Error:
         loggerError.error(str(Error) + ' occured at: ' + response.url)
         data = 'Error'
     finally:
         return data
示例#17
0
 def getPageContent(self, response):
     """Return the article body, trying three selectors in order of
     preference; 'Error' when all of them come back empty."""
     for xpath in ("//div[@id='article-main']/p/text()",
                   "//div[@itemprop='articleBody']/p/text()",
                   "//meta[@property='og:description']/@content"):
         data = ' '.join(response.xpath(xpath).extract())
         if data:
             return data
     loggerError.error(response.url)
     return 'Error'
示例#18
0
File: newsx.py  Project: atb00ker/scrape
 def getPageImage(self, response):
     """Return the article image (link[rel=image_src] first, then the
     video thumbnail); 'Error' when neither exists."""
     for xpath in (
             "//head/link[@rel='image_src']/@href",
             "//div[@class='panel-body story']/div[@class='thumbnail video-thumbnail']/img/@src",
     ):
         image = response.xpath(xpath).extract_first()
         if image is not None:
             return image
     loggerError.error(response.url)
     return 'Error'
示例#19
0
 def getPageDate(self, response):
     """Return the second 'dattime' span with its last three space-separated
     tokens removed; 'Error' on any lookup failure."""
     try:
         spans = response.xpath("//span[@class='dattime']/text()").extract()
         # Index 1 holds the publish stamp; IndexError (too few spans) is
         # caught below.  rsplit trims the trailing tokens.
         data = spans[1].rsplit(' ', 3)[0]
     except Exception as Error:
         loggerError.error(str(Error) + ' occured at: ' + response.url)
         data = 'Error'
     finally:
         return data
示例#20
0
 def getPageContent(self, response):
     """Return the article body, falling back to og:description;
     'Error' when neither yields text."""
     data = ' '.join(
         response.xpath("//div[contains(@class,'io-article-body')]/p/text()"
                        ).extract())
     if not data:
         data = response.xpath(
             "//meta[@property='og:description']/@content").extract_first()
     if data is None:
         # BUG FIX: the original referenced an undefined `Error` name here,
         # raising NameError instead of logging the failing URL.
         loggerError.error(response.url)
         data = 'Error'
     return data
示例#21
0
File: newsx.py  Project: atb00ker/scrape
 def getPageDate(self, response):
     """Return the datePublished meta content without its timezone offset;
     'Error' on any failure."""
     try:
         stamp = response.xpath(
             "//head/meta[@itemprop='datePublished']/@content"
         ).extract_first()
         # AttributeError (stamp is None) is caught below.
         data = stamp.rsplit('+', 1)[0]
     except Exception as Error:
         loggerError.error(str(Error) + " occured at: " + response.url)
         data = 'Error'
     finally:
         return data
示例#22
0
 def getPageContent(self, response):
     """Return the article body via three fallbacks; 'Error' when all fail."""
     data = ' '.join(response.xpath(
         "//div[contains(@class,'io-article-body')]//text()").extract())
     if not data:
         data = ' '.join(response.xpath(
             "//div[contains(@id,'slider0')]/p/text()").extract())
     if not data:
         # Last resort: every non-script/style text node inside <article>,
         # with plain spaces and non-breaking spaces filtered out.
         pieces = response.xpath(
             "//article//*[not(self::script) and not(self::style)]/text()"
         ).extract()
         data = ' '.join(p for p in pieces if p != ' ' and p != u'\xa0')
     if not data:
         loggerError.error(response.url)
         data = 'Error'
     return data
示例#23
0
 def getPageImage(self, response):
     """Return the article image URL for hindi.oneindia.com pages.

     Tries the lazy-loaded listical image, then link[rel=image_src], then
     the plain listical src; logs and returns 'Error' when all fail.
     """
     base = 'https://hindi.oneindia.com'
     try:
         # extract_first() returns None when the node is missing, so the
         # concatenation raises TypeError — the only expected failure here
         # (the original over-broadly caught Exception).
         data = base + response.xpath(
             "//img[contains(@class,'image_listical')]/@data-pagespeed-lazy-src"
         ).extract_first()
     except TypeError:
         data = response.xpath(
             "//link[@rel='image_src']/@href").extract_first()
         if not data:
             try:
                 data = base + response.xpath(
                     "//img[contains(@class,'image_listical')]/@src"
                 ).extract_first()
             except TypeError as Error:
                 loggerError.error(str(Error) + ' occured at: ' + response.url)
                 data = 'Error'
     return data
示例#24
0
 def getPageContent(self, response):
     """Return the article body (body-article paragraphs, then
     full-details, then the synopsis heading); 'Error' when all empty."""
     for xpath in ("//div[@class='body-article']/p/text()",
                   "//div[@class='full-details']/p/text()"):
         data = ' '.join(response.xpath(xpath).extract())
         if data:
             return data
     data = ' '.join(
         response.xpath('//h2[@class="synopsis"]/text()').extract())
     # NOTE(review): this logs the URL even when the synopsis fallback
     # succeeds — presumably to flag degraded extraction; confirm intent.
     loggerError.error(response.url)
     if not data:
         loggerError.error(response.url)
         data = 'Error'
     return data
示例#25
0
 def getPageImage(self, response):
     """Return the article image: lazy-loaded caption image, then the
     itemprop meta URL, then the URL embedded in the lead-article inline
     style; 'Error' when the style parse fails."""
     data = response.xpath(
         '//span[@class="custom-caption"]/img/@data-lazy-src'
     ).extract_first()
     if data is None:
         data = response.xpath(
             "//span[@itemprop='image']/meta[@itemprop='url']/@content"
         ).extract_first()
     if data is None:
         try:
             # Pull the url(...) argument out of the inline style.
             style = response.xpath(
                 '//div[@class="lead-article"]/@style').extract_first()
             data = style.split('url(', 1)[1].split(')', 1)[0]
         except Exception as Error:
             loggerError.error(str(Error) + " occured at: " + response.url)
             data = 'Error'
     return data
示例#26
0
File: newsx.py  Project: atb00ker/scrape
 def getPageContent(self, response):
     """Return article text via the short-title heading, then articleBody
     paragraphs, then articleBody divs; 'Error' when all are empty."""
     candidates = (
         "//div[@class='story-short-title']/h2/text()",
         "//div[@itemprop='articleBody']/p//text()",
         "//div[@itemprop='articleBody']/div//text()",
     )
     for xpath in candidates:
         data = ' '.join(response.xpath(xpath).extract())
         if data:
             return data
     loggerError.error(response.url)
     return 'Error'
示例#27
0
 def getPageContent(self, response):
     """Return the article body, trying a cascade of layout-specific
     selectors in order; logs and returns 'Error' when all come back empty.

     Replaces ten copy-pasted fallback stanzas with a single ordered loop;
     the selector order and behaviour are unchanged.
     """
     # Ordered from the most common layout to rare legacy layouts.
     selectors = (
         "//div[contains(@class,'csmpn')]/p//text()",
         "//div[contains(@class,'aXjCH')]/div/p//text()",
         "//div[contains(@class,'csmpn')]/div[not(@class) or @class='alltake_head']//text()",
         "//div[@itemprop='articleBody']/p/text()",
         "//div[@class='fulstorysharecomment']/text()",
         "//div[contains(@class,'csmpn')]/ul/li/text()",
         "//div[@class='fullstorydivstorycomment' or @class='fullstorydivstory' or @class='fulstorytext']//text()",
         "//div[@class='_1mf _1mj' or @class='ttl']//text()",
         "//p[@class='news-intro']//text()",
         "//div[contains(@class,'csmpn')]/div[not(@class)]/text()",
     )
     for xpath in selectors:
         data = ' '.join(response.xpath(xpath).extract())
         if data:
             return data
     loggerError.error(response.url)
     return 'Error'
示例#28
0
 def getPageTitle(self, response):
     """Return the headline, trying several layout variants in order;
     'Error' when none of them match."""
     for xpath in (
             '//h1[@itemprop="headline"]/text()',
             "//h1[contains(@class,'headline')]/text()",
             "//h1[@class='_8UFs4BVE']/text()",
             "//span[@class='xxx_oneoff_special_story_v3_headline']/text()",
     ):
         title = response.xpath(xpath).extract_first()
         if title is not None:
             return title
     loggerError.error(response.url)
     return 'Error'
示例#29
0
 def getPageTitle(self, response):
     """Return the headline with whitespace normalised, falling back to
     alternate layouts when the itemprop headline is absent; 'Error' on
     total failure."""
     try:
         data = ' '.join(
             response.xpath("//h1[@itemprop='headline']/text()").
             extract_first().split())
     except AttributeError as Error:
         # extract_first() returned None — try alternate page layouts.
         data = response.xpath(
             '//h1[@class="story-title"]/text()').extract_first()
         if data is None:
             data = response.xpath(
                 '//h1[@class="page-title article-title"]/text()'
             ).extract_first()
         if data is None:
             # BUG FIX: the original called loggerError.error(Error, url),
             # passing the exception as the logging format string; use the
             # file's standard message format instead.
             loggerError.error(str(Error) + ' occured at: ' + response.url)
             data = 'Error'
     except Exception as Error:
         loggerError.error(str(Error) + ' occured at: ' + response.url)
         data = 'Error'
     finally:
         return data
示例#30
0
 def parse_title(self, response):
     """Return the article title for the NDTV property serving this page,
     or None (after logging) for an unrecognised host."""
     url = response.url
     if 'gadgets.ndtv.com' in url:
         return response.xpath('//div[@class="lead_heading"]/h1/span/text()'
                               ).extract_first().strip()
     elif ('www.ndtv.com' in url or 'food.ndtv.com' in url
           or 'sports.ndtv.com' in url or 'profit.ndtv.com' in url):
         # These four properties share the itemprop headline markup
         # (the original had two identical elif branches).
         return response.xpath(
             '//h1[@itemprop="headline"]/text()').extract_first().strip()
     elif 'auto.ndtv.com' in url:
         return response.xpath(
             '//h1[@class="article__headline"]/text()').extract_first()
     elif 'doctor.ndtv.com' in url:
         # BUG FIX: article_heading was unquoted, so XPath treated it as a
         # child *element* test whose string-value is usually '' — and
         # contains(x, '') is always true, matching every div.  Quote it so
         # the class name is compared as a string.
         return response.xpath(
             '//div[contains(@class, "article_heading")]/div[@class="__sslide"]/h1/text()'
         ).extract_first().strip()
     else:
         loggerError.error('Could not handle Parsing TITLE at ' +
                           response.url)
         return None