예제 #1
0
 def getPageContent(self, response):
     """Return the page's text content as one space-joined string.

     The candidate XPath queries are tried in order and the first one that
     produces non-empty text is used. Returns the string 'Error' when no
     query yields text or when extraction raises.
     """
     candidate_queries = (
         "//div[@id='article-body']/div/p/text()",
         "//section[@class='chapter']//text()",
         "//div[contains(@class,'-5s7sjXv')]/div/div/article/p/text()",
         "//div[contains(@class,'_1Joi0PLr')]//span/text()",
     )
     try:
         data = ''
         for query in candidate_queries:
             data = ' '.join(response.xpath(query).extract())
             if data:
                 # First layout that matches wins; skip the remaining ones.
                 break
         if not data:
             logger.error(__name__ + " Unable to Extract Content  : " +
                          response.url)
             data = 'Error'
     except Exception as err:
         logger.error(__name__ +
                      " [UNHANDLED] Unable to Extract Content : " + str(err) +
                      " : " + response.url)
         data = 'Error'
     return data
예제 #2
0
def get_logs_or_items(sql, sort, order, queries):
    """Run a sorted, paginated query and return the result as a JSON string.

    A ``SELECT * FROM res_main, res_count ... OFFSET %s FETCH NEXT %s ROWS
    ONLY`` statement is appended to ``sql``, so ``sql`` presumably defines
    ``res_main`` and ``res_count`` (e.g. as a WITH clause) -- confirm against
    the callers.

    Args:
        sql: SQL text placed in front of the generated SELECT.
        sort: Column of ``res_main`` to order by. Only letters, digits and
            underscores are accepted because the name is spliced into the
            statement text.
        order: Sort direction: ``ASC``, ``DESC`` (any case) or an empty
            string.
        queries: Parameters bound to the ``%s`` placeholders of the final
            statement.

    Returns:
        JSON text of ``{"total": ..., "rows": [...]}``. Every column value
        of a row (including its ``total`` column) is converted with ``str``.
        On any failure, including rejected ``sort``/``order`` values, the
        error is logged and ``total`` is 0.
    """
    items = {
        'total': 0,
        'rows': []
    }

    try:
        # Identifiers and keywords cannot be bound as query parameters, so
        # they are validated before being concatenated into the statement.
        if not isinstance(sort, str) or not sort.replace('_', '').isalnum():
            raise ValueError("Invalid sort column: " + repr(sort))
        if (not isinstance(order, str)
                or order.strip().upper() not in ('', 'ASC', 'DESC')):
            raise ValueError("Invalid sort order: " + repr(order))

        statement = (sql +
                     "SELECT * FROM res_main, res_count" +
                     " ORDER BY res_main." + sort + " " + order +
                     " OFFSET %s FETCH NEXT %s ROWS ONLY;")

        conn = postgresSQL()
        # Execute and fetch on the SAME cursor: a result set belongs to the
        # cursor that ran the query, so fetching from a different cursor
        # than the one that executed never returns these rows.
        cursor = conn.RealDictCursor
        cursor.execute(statement, queries)
        data = cursor.fetchall()

        for row in data:
            items['rows'].append(
                {column: str(row[column]) for column in row})
        # An empty page has no first row to read the total from.
        if data:
            items['total'] = data[0]['total']
    except Exception as e:
        items['total'] = 0
        logger.error(__name__ + " " + str(e))
    return json.dumps(items)
예제 #3
0
 def parse_title(self, response):
     """Return the stripped headline of an NDTV article page, or None.

     The XPath query is chosen by looking for a known NDTV host name inside
     ``response.url``; hosts are checked in the order listed below.

     Returns:
         The headline with surrounding whitespace removed, or None when the
         host is not supported, the headline is missing, or extraction
         raises (each case is logged).
     """
     # (host substrings, XPath query) pairs, checked in order.
     title_queries = (
         (('gadgets.ndtv.com',),
          '//div[@class="lead_heading"]/h1/span/text()'),
         (('www.ndtv.com', 'food.ndtv.com', 'sports.ndtv.com',
           'profit.ndtv.com'),
          '//h1[@itemprop="headline"]/text()'),
         (('auto.ndtv.com',),
          '//h1[@class="article__headline"]/text()'),
         # NOTE(review): ``article_heading`` is not quoted in this query, so
         # XPath reads it as a child-element test rather than a string and
         # the contains() predicate does not filter on the class. Left as
         # is because the intended class cannot be verified from here.
         (('doctor.ndtv.com',),
          '//div[contains(@class, article_heading)]/div[@class="__sslide"]/h1/text()'),
     )
     try:
         url = response.url
         for hosts, query in title_queries:
             if not any(host in url for host in hosts):
                 continue
             title = response.xpath(query).extract_first()
             if title is None:
                 logger.error(__name__ + ' Unable to Extract Title ' + url)
                 return None
             return title.strip()
         # No known host matched the URL.
         logger.error(__name__ + ' Unable to Extract Title ' + url)
         return None
     except Exception as e:
         logger.error(__name__ + " [UNHANDLED] Unable to Extract Title : " +
                      str(e) + " : " + response.url)
         return None
예제 #4
0
파일: db.py 프로젝트: vipulgupta2048/scrape
    def insertIntoNewsTable(self, item, log_id):
        """Insert a processed news item into NEWS_TABLE.

        Returns True once the INSERT has been executed. Returns False when
        there is no database connection, when the database reports error
        code 23505 (the link already exists), or on any other failure.
        """
        try:
            # Nothing can be written without a working connection.
            connected = self.checkConnection()
            if not connected:
                return False

            query = (
                "INSERT INTO " + DB_INFO['NEWS_TABLE'] +
                " (title, content, image, link, newsDate, site_id, log_id, datescraped)"
                " VALUES (%s, %s, %s, %s, %s, %s, %s, NOW())"
            )
            # Item fields in the order of the placeholders; log_id comes last.
            field_order = ('title', 'content', 'image', 'link', 'newsDate',
                           'source')
            values = tuple(item[field] for field in field_order) + (log_id,)

            self.cursor.execute(query, values)
            return True
        except psycopg2.Error as db_error:
            # 23505 means the link already exists; that case is not logged.
            if str(db_error.pgcode) != '23505':
                logger.error(__name__ + "(" + str(db_error.pgcode) + ") " + str(db_error) + " occured at " + str(item.get('link')))
        except Exception as other_error:
            logger.error(__name__ + " Error While Inserting to Database : " + str(other_error))

        return False
예제 #5
0
    def parse_date(self, response):
        """Return the article date formatted as ``%Y-%m-%dT%H:%M:%S``.

        The XPath query is selected by the NDTV host name found in
        ``response.url``. The last six characters of the extracted value are
        cut off before parsing. Returns None when the host is unknown or
        when extraction/parsing raises; both cases are logged.
        """
        # (host substrings, XPath query) pairs, checked in order.
        date_queries = (
            (('www.ndtv.com',),
             '//meta[@name="modified-date"]/@content'),
            (('doctor.ndtv.com', 'sports.ndtv.com'),
             '//meta[@name="publish-date"]/@content'),
            (('auto.ndtv.com',),
             '//meta[@itemprop="datePublished"]/@content'),
            (('food.ndtv.com',),
             '//span[@itemprop="dateModified"]/@content'),
            (('gadgets.ndtv.com', 'profit.ndtv.com'),
             '//meta[@name="publish-date"]/@content'),
        )
        try:
            page_url = response.url
            matched_query = None
            for hosts, query in date_queries:
                if any(host in page_url for host in hosts):
                    matched_query = query
                    break

            if matched_query is None:
                logger.error(__name__ + ' Unable to Extract Date ' +
                             response.url)
                return None

            raw_value = response.xpath(matched_query).extract_first()[:-6]
            date = parser.parse(raw_value).strftime('%Y-%m-%dT%H:%M:%S')
        except Exception as err:
            logger.error(__name__ + " [UNHANDLED] Unable to extract Date : " +
                         str(err) + " : " + response.url)
            return None

        return date
예제 #6
0
파일: db.py 프로젝트: vipulgupta2048/scrape
    def connect(self):
        """Return a database cursor, opening a new connection when needed.

        When ``self.connection`` exists and its ``closed`` attribute is 0,
        the current ``self.cursor`` is returned unchanged. Otherwise a new
        connection is created from ``DB_INFO`` with autocommit enabled, the
        cursors ``self.cursor`` (DictCursor) and ``self.RealDictCursor`` are
        created, and ``self.initilaize()`` is called.

        Returns:
            ``self.cursor`` on success, or False when connecting fails. On
            failure any partially opened connection is closed and
            ``self.connection`` is reset to None.
        """
        # Reuse the current connection while it is still open.
        if self.connection is not None and self.connection.closed == 0:
            return self.cursor

        # Create a New Connection
        try:
            # Connect to Database
            self.connection = psycopg2.connect(host=DB_INFO['HOST_NAME'],
                user=DB_INFO['USERNAME'],
                database=DB_INFO['DATABASE_NAME'],
                password=DB_INFO['PASSWORD'])
            # Enable AutoCommit
            self.connection.autocommit = True
            # Set Cursor to DictCursor
            self.cursor = self.connection.cursor(cursor_factory=psycopg2.extras.DictCursor)
            self.RealDictCursor = self.connection.cursor(cursor_factory=rdc)

            logger.debug(__name__+" Connected to Database")

            # Try Initializing the Database; a failure is only logged.
            if not self.initilaize():
                logger.error(__name__ + " Database Cannot be initialiazed automatically. Try it Manually.")

            return self.cursor
        except Exception as e:
            logger.critical(__name__+" Database Connection Error! Msg: " + str(e))
            # Do not keep a half-configured connection around.
            if self.connection is not None:
                self.connection.close()
                self.connection = None

        return False
예제 #7
0
 def parse_more_news(self, response):
     """Parse a JSON "more news" response and yield one item per news card.

     The JSON body provides ``min_news_id`` (stored on ``self.news_id``) and
     an ``html`` fragment. Every ``div.news-card`` in that fragment counts as
     parsed; a card is counted as scraped after its item has been yielded
     and as dropped when building the item raises.
     """
     try:
         payload = json.loads(response.text)
         self.news_id = payload['min_news_id']
         cards = Selector(text=payload['html']).css('div.news-card')
         for card in cards:
             self.urls_parsed += 1
             try:
                 item = ScrapenewsItem()
                 # Fixed-width slices trim the raw attribute values
                 # (presumably the CSS url(...) wrapper and a time-zone
                 # suffix -- TODO confirm against the site markup).
                 style_attr = card.css(
                     'div.news-card-image::attr(style)').extract_first()
                 item['image'] = style_attr[23:-3]
                 item['title'] = card.css(
                     'a.clickable>span::text').extract_first()
                 item['content'] = card.css(
                     'div[itemprop*=articleBody]::text').extract_first()
                 time_attr = card.css(
                     'span.time::attr(content)').extract_first()
                 item['newsDate'] = time_attr[:-5]
                 item['link'] = card.css(
                     'div.read-more>a::attr(href)').extract_first()
                 item['source'] = 105
                 yield item
                 self.urls_scraped += 1
             except Exception as card_error:
                 logger.error(__name__ +
                              " [UNHANDLED] Unable to Extract Data : " +
                              str(card_error))
                 self.urls_dropped += 1
     except Exception as page_error:
         logger.error(__name__ + " [UNHANDLED] " + str(page_error) +
                      " for response url " + response.url)
예제 #8
0
 def getPageContent(self, response):
     """Return the page's text content as one space-joined string.

     Two simple queries are tried first. The last fallback collects every
     text node under <article> (excluding script/style elements) and drops
     fragments that are exactly a single space or a non-breaking space.
     Returns 'Error' when nothing is found or extraction raises.
     """
     simple_queries = (
         "//div[contains(@class,'io-article-body')]//text()",
         "//div[contains(@id,'slider0')]/p/text()",
     )
     blank_fragments = (' ', u'\xa0')
     try:
         data = ''
         for query in simple_queries:
             data = ' '.join(response.xpath(query).extract())
             if data:
                 break
         if not data:
             fragments = response.xpath(
                 "//article//*[not(self::script) and not(self::style)]/text()"
             ).extract()
             # Skip the blank fragments before joining.
             data = ' '.join(
                 piece for piece in fragments
                 if piece not in blank_fragments)
         if not data:
             logger.error(__name__ + " Unable to extract Content : " +
                          response.url)
             data = 'Error'
     except Exception as err:
         logger.error(__name__ +
                      " [UNHANDLED] Unable to extract Content : " + str(err) +
                      " : " + response.url)
         data = 'Error'
     return data
예제 #9
0
    def getPageDate(self, response):
        """Return the article's publication date string, or 'Error'.

        The first ``application/ld+json`` script is searched for
        ``datePublished":"`` and the 19 characters after it are returned.
        When that marker is missing, the first head script containing
        ``<script>utag_data`` is searched for ``"publish_date":"`` and the
        value up to the first ``+`` is returned. If neither source provides
        a date, or extraction raises, the failure is logged and 'Error' is
        returned.
        """
        try:
            data = None
            ld_marker = 'datePublished":"'
            ld_json = response.xpath(
                "//script[@type='application/ld+json']").extract_first()
            if ld_json is not None and ld_marker in ld_json:
                data = ld_json.split(ld_marker, 1)[1][:19]
            else:
                # This fail case works only on very specific articles.
                head_scripts = response.xpath(
                    "/html/head/script[not(contains(@type,'text/javascript'))]")
                for script in head_scripts:
                    _, has_utag, utag_data = script.extract().partition(
                        "<script>utag_data")
                    if not has_utag:
                        continue
                    # Only the first utag_data script is inspected.
                    _, has_date, tail = utag_data.partition(
                        '"publish_date":"')
                    if has_date:
                        data = tail.split("+", 1)[0]
                    break
            if data is None:
                logger.error(__name__ + " Unable to Extract Date : " +
                             response.url)
                data = 'Error'
        except Exception as Error:
            logger.error(__name__ + " [UNHANDLED] Unable to Extract Date : " +
                         str(Error) + " : " + response.url)
            data = 'Error'

        return data
예제 #10
0
 def getPageDate(self, response):
     """Return the date text from the second ``span.dattime`` text node.

     The last three space-separated tokens of that text are removed.
     Returns 'Error' (after logging) when extraction raises.
     """
     try:
         stamps = response.xpath("//span[@class='dattime']/text()").extract()
         # rsplit keeps everything before the last three spaces.
         leading_part = stamps[1].rsplit(' ', 3)
         return leading_part[0]
     except Exception as err:
         logger.error(__name__ + " [UNHANDLED] Unable to Extract Date : " + str(err) + " : " + response.url)
         return 'Error'
예제 #11
0
 def getPageDate(self, response):
     """Return the content of the page's ``Last-Modified`` meta tag.

     Returns 'Error' (after logging) when the tag is missing or when
     extraction raises, so callers never receive None.
     """
     try:
         data = response.xpath("/html/head/meta[@name='Last-Modified']/@content").extract_first()
         if data is None:
             logger.error(__name__ + " Unable to Extract Date : " + response.url)
             data = 'Error'
     except Exception as Error:
         logger.error(__name__ + " [UNHANDLED] Unable to Extract Date : " + str(Error) + " : " + response.url)
         data = 'Error'
     return data
예제 #12
0
 def getPageDate(self, response):
     """Return the content of the ``article:published_time`` meta tag.

     Returns 'Error' (after logging) when the tag is missing or when
     extraction raises, so callers never receive None.
     """
     try:
         data = response.xpath("//head/meta[@property='article:published_time']/@content").extract_first()
         if data is None:
             logger.error(__name__ + " Unable to extract Date : " + response.url)
             data = 'Error'
     except Exception as e:
         logger.error(__name__ + " [UNHANDLED] Unable to extract Date : " + str(e) + " : " + response.url)
         data = 'Error'
     return data
예제 #13
0
 def process_date(self, date):
     """Parse ``date`` and return the parsed value converted with ``str``.

     Parsing is fuzzy and keeps time-zone information (``ignoretz=False``).
     When parsing fails the input and the reason are logged and a DropItem
     exception carrying the reason is raised.
     """
     try:
         return str(parser.parse(date, ignoretz=False, fuzzy=True))
     except Exception as err:
         reason = str(err)
         logger.error(__name__ + " Unable to Parse Date (Input: " +
                      str(date) + ") due to " + reason)
         raise DropItem("Unable to Parse Date due to " + reason)
예제 #14
0
 def getPageDate(self, response):
     """Return the first ``<time datetime>`` value up to its last '+'.

     Everything from the last '+' onwards is removed (presumably the UTC
     offset). Returns 'Error' (after logging) when extraction raises,
     including when no such attribute exists.
     """
     try:
         stamp = response.xpath("//time/@datetime").extract_first()
         return stamp.rsplit('+', 1)[0]
     except Exception as err:
         logger.error(__name__ + " [UNHANDLED] Unable to extract Date : " +
                      str(err) + ' : ' + response.url)
         return 'Error'
예제 #15
0
 def getPageTitle(self, response):
     """Return the headline with all whitespace runs collapsed to one space.

     Returns 'Error' (after logging) when extraction raises, including when
     no ``h1[itemprop=headline]`` text exists.
     """
     try:
         headline = response.xpath(
             "//h1[@itemprop='headline']/text()").extract_first()
         words = headline.split()
         return ' '.join(words)
     except Exception as err:
         logger.error(__name__ + " [UNHANDLED] Unable to Extract Title : " +
                      str(err) + " : " + response.url)
         return 'Error'
예제 #16
0
 def getPageTitle(self, response):
     """Return the text of ``h1.arttitle``, or 'Error' when unavailable.

     A missing title and an unexpected exception are both logged.
     """
     try:
         title = response.xpath('//h1[@class="arttitle"]/text()').extract_first()
         if title is not None:
             return title
         logger.error(__name__ + " Unable to Extract Page Title: " + response.url)
     except Exception as err:
         logger.error(__name__ + " [UNHANDLED] Unable to Extract Page Title : " + str(err) + " : " + response.url)
     return 'Error'
예제 #17
0
 def gettitle(self,response):
     """Return the article heading text, or 'Error' when unavailable.

     A missing heading and an unexpected exception are both logged.
     """
     heading_query = '//h1[contains(@class, "article-heading margin")]/text()'
     try:
         heading = response.xpath(heading_query).extract_first()
         if heading is not None:
             return heading
         logger.error(__name__ + " Unable to Extract Title : " + response.url)
     except Exception as err:
         logger.error(__name__ + " [UNHANDLED] Unable to Extract Title : " + str(err) + " : " + response.url)
     return 'Error'
예제 #18
0
 def getPageDate(self, response):
     """Return the ``article:published_time`` value up to its last '+'.

     Everything from the last '+' onwards is removed (presumably the UTC
     offset). Returns 'Error' (after logging) when extraction raises,
     including when the meta tag is missing.
     """
     try:
         published = response.xpath(
             "/html/head/meta[@property='article:published_time']/@content"
         ).extract_first()
         return published.rsplit('+', 1)[0]
     except Exception as err:
         logger.error(__name__ + " [UNHANDLED] Unable to Extract Date : " +
                      str(err) + ' : ' + response.url)
         return 'Error'
예제 #19
0
 def getPageContent(self, response):
     """Return the text under ``div.content`` as one space-joined string.

     Text that sits directly inside script elements is excluded by the
     query. Returns 'Error' (after logging) when no text is found or when
     extraction raises.
     """
     try:
         fragments = response.xpath("//div[@class='content']//*[not(self::script)]/text()").extract()
         content = ' '.join(fragments)
         if content:
             return content
         logger.error(__name__ + " Unable to Extract Content : " + response.url)
     except Exception as err:
         logger.error(__name__ + " [UNHANDLED] Unable to Extract Content : " + str(err) + " : "+ response.url)
     return 'Error'
예제 #20
0
 def getPageTitle(self, response):
     """Return the text of the page's ``<title>``, or 'Error'.

     A missing title and an unexpected exception are both logged.
     """
     try:
         page_title = response.xpath("head/title/text()").extract_first()
         if page_title is not None:
             return page_title
         logger.error(__name__ + " Unable to extract page title " + response.url)
     except Exception as err:
         logger.error(__name__ + " [UNHANDLED] Unable to extract page title : " + str(err) + " : " + response.url)
     return 'Error'
예제 #21
0
 def getimage(self,response):
     """Return the ``src`` of the first image in a ``field-item`` div.

     Returns 'Error' (after logging) when no image is found or when
     extraction raises.
     """
     image_query = '//div[contains(@class, "field-item")]/img/@src'
     try:
         image_url = response.xpath(image_query).extract_first()
         if image_url is not None:
             return image_url
         logger.error(__name__ + " Unable to Extract Image : " + response.url)
     except Exception as err:
         logger.error(__name__ + " [UNHANDLED] Unable to Extract Image : " + str(err) + " : " + response.url)
     return 'Error'
예제 #22
0
 def getcontent(self,response):
     """Return the article's paragraph texts as a list of strings.

     Returns 'Error' (after logging) when no paragraph text is found or
     when extraction raises.
     """
     try:
         data = response.xpath('//div[contains(@class, "article")]/div[contains(@class, "field")]//p/text()').extract()
         # extract() returns a list, so "nothing found" is an empty list,
         # never None.
         if not data:
             logger.error(__name__ + " Unable to Extract Content : " + response.url)
             data = 'Error'
     except Exception as e:
         logger.error(__name__ + " [UNHANDLED] Unable to Extract Content : " + str(e) + " : " + response.url)
         data = 'Error'
     return data
예제 #23
0
 def getPageImage(self, response):
     """Return the ``src`` of the first figure image under ``div.content``.

     Returns 'Error' (after logging) when no image is found or when
     extraction raises.
     """
     try:
         image_url = response.xpath('//div[@class="content"]/div/figure/img/@src').extract_first()
         if image_url is not None:
             return image_url
         logger.error(__name__ + " Unable to Extract Image : " + response.url)
     except Exception as err:
         logger.error(__name__ + " [UNHANDLED] Unable to Extract Image : " + str(err) + " : " + response.url)
     return 'Error'
예제 #24
0
 def getPageImage(self, response):
     """Return the page's ``og:image`` meta content, or 'Error'.

     A missing tag and an unexpected exception are both logged.
     """
     try:
         og_image = response.xpath("//meta[@property='og:image']/@content").extract_first()
         if og_image is not None:
             return og_image
         logger.error(__name__ + " Unable to Extract Image : " + response.url)
     except Exception as err:
         logger.error(__name__ + " [UNHANDLED] Unable to extract Image : " + str(err) + " : " + response.url)
     return 'Error'
예제 #25
0
 def getPageDate(self, response):
     """Return the ``datePublished`` meta value up to its last '+'.

     Everything from the last '+' onwards is removed (presumably the UTC
     offset). Returns 'Error' (after logging) when extraction raises,
     including when the meta tag is missing.
     """
     try:
         published = response.xpath(
             "//head/meta[@itemprop='datePublished']/@content"
         ).extract_first()
         return published.rsplit('+', 1)[0]
     except Exception as err:
         logger.error(__name__ + " [UNHANDLED] Unable to Extract Date : " +
                      str(err) + " : " + response.url)
         return 'Error'
예제 #26
0
 def getdatetime(self,response):
     """Return the article date formatted as ``%Y-%m-%d %H:%M:%S``.

     The text of the first ``span`` whose class contains "date" is trimmed
     by 10 leading and 4 trailing characters (presumably a label and a
     time-zone suffix -- TODO confirm against the site markup) and parsed
     as ``" %b %d, %Y, <hour>:%M %p"``.

     Returns:
         The normalised date string, or 'Error' (after logging) when the
         span is missing or its text does not match the expected format.
     """
     try:
         raw = response.xpath('//span[contains(@class, "date")]/text()').extract_first()[10:-4]
         try:
             # %p only affects the hour when it is parsed with %I, so the
             # 12-hour form is tried first to keep AM/PM information.
             parsed = datetime.strptime(raw, " %b %d, %Y, %I:%M %p")
         except ValueError:
             # Hours outside 1-12: fall back to the 24-hour reading.
             parsed = datetime.strptime(raw, " %b %d, %Y, %H:%M %p")
         data = parsed.strftime("%Y-%m-%d %H:%M:%S")
     except Exception as e:
         logger.error(__name__ + " [UNHANDLED] Unable to Extract Date : " + str(e) + " : " + response.url)
         data = 'Error'
     return data
예제 #27
0
 def getlink(self, response):
     """Return the URL of ``response``, or 'Error' for a start URL.

     A response whose URL is one of ``self.start_urls`` is logged and
     reported as 'Error'. ``start_urls`` may be a collection of URLs or a
     single URL string.
     """
     try:
         data = response.url
         start_urls = self.start_urls
         # Comparing a URL string with a list is never equal, so membership
         # is tested instead; a bare string is treated as one URL.
         if isinstance(start_urls, str):
             start_urls = (start_urls,)
         if data in start_urls:
             logger.error(__name__ + " Unable to Extract Link : " +
                          response.url)
             data = 'Error'
     except Exception as e:
         logger.error(__name__ + " [UNHANDLED] Unable to Extract Link : " +
                      str(e) + " : " + response.url)
         data = 'Error'
     return data
예제 #28
0
 def parse(self, response):
     """Yield a request for every listed article that is not stored yet.

     Each ``li.clearfix`` under ``ul#cagetory`` contributes one link. Links
     that ``self.postgres.checkUrlExists`` does not report as existing are
     counted in ``urls_parsed`` and requested with ``parse_article`` as
     callback; existing links and entries without a link are counted in
     ``urls_dropped``.
     """
     try:
         newsContainer = response.xpath("//ul[@id='cagetory']/li[@class='clearfix']")
         for newsBox in newsContainer:
             link = newsBox.xpath('a/@href').extract_first()
             if not link:
                 # An entry without href cannot be requested; skip it so
                 # the remaining entries are still processed.
                 self.urls_dropped += 1
                 continue
             if not self.postgres.checkUrlExists(link):
                 self.urls_parsed += 1
                 yield scrapy.Request(url=link, callback=self.parse_article, errback=self.errorRequestHandler)
             else:
                 self.urls_dropped += 1
     except Exception as e:
         logger.error(__name__+ " [UNHANDLED] " + str(e) + " for response url " + response.url)
예제 #29
0
 def parse(self, response):
     """Yield a request for every listed article that is not stored yet.

     Each ``div.singlesunday`` contributes one site-relative href, which is
     prefixed with ``http://www.asianage.com``. Links that
     ``self.postgres.checkUrlExists`` does not report as existing are
     counted in ``urls_parsed`` and requested with ``parse_article`` as
     callback; existing links and entries without an href are counted in
     ``urls_dropped``.
     """
     try:
         newsContainer = response.xpath("//div[@class='singlesunday']")
         for newsBox in newsContainer:
             href = newsBox.xpath('div/h2/a/@href').extract_first()
             if not href:
                 # Concatenating a missing href would raise and end the
                 # whole loop; skip only this entry instead.
                 self.urls_dropped += 1
                 continue
             link = 'http://www.asianage.com' + href
             if not self.postgres.checkUrlExists(link):
                 self.urls_parsed += 1
                 yield scrapy.Request(url=link, callback=self.parse_article, errback=self.errorRequestHandler)
             else:
                 self.urls_dropped += 1
     except Exception as e:
         logger.error(__name__ + " [UNHANDLED] " + str(e) + " for response url " + response.url)
예제 #30
0
 def getPageContent(self, response):
     """Return the story body text as one space-joined string.

     The direct text of the ``#storyBody`` paragraphs is tried first, then
     all text nested inside those paragraphs. Returns 'Error' (after
     logging) when nothing is found or when extraction raises.
     """
     paragraph_queries = (
         "//div[@id='storyBody']/p/text()",
         "//div[@id='storyBody']/p//text()",
     )
     try:
         content = ''
         for query in paragraph_queries:
             content = ' '.join(response.xpath(query).extract())
             if content:
                 return content
         logger.error(__name__ + " Unable to extract page content " + response.url)
     except Exception as err:
         logger.error(__name__ + " [UNHANDLED] Unable to extract Content : " + str(err) + " : " + response.url)
     return 'Error'