Example No. 1
 def get_page_with_param(self, params):
     logger.debug('get_page_with_param: self.url=%s, params=%s' %
                  (self.url, params))
     if params == None:
         try:
             html_page = urllib2.urlopen(
                 self.url).read().decode('windows-1255').encode('utf-8')
         except urllib2.URLError:
             logger.error("can't open URL: %s" % self.url)
             send_chat_notification(__name__, 'failed to open url', {
                 'url': self.url,
                 'params': None
             })
             return None
         try:
             soup = BeautifulSoup(html_page)
         except HTMLParseError, e:
             logger.debug("parsing URL: %s - %s. will try harder." %
                          (self.url, e))
             html_page = re.sub(
                 "(?s)<!--.*?-->", " ",
                 html_page)  # cut anything that looks suspicious
             html_page = re.sub("(?s)<script>.*?</script>", " ", html_page)
             html_page = re.sub("(?s)<!.*?>", " ", html_page)
             try:
                 soup = BeautifulSoup(html_page)
             except HTMLParseError, e:
                 logger.debug("error parsing URL: %s - %s" % (self.url, e))
                 send_chat_notification(__name__, 'failed to parse url', {
                     'url': self.url,
                     'params': None
                 })
                 return None
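Every example on this page calls send_chat_notification(source, message, context) with a module or file name, a short human-readable message, and a dict of context values. The project's actual implementation is not shown in these snippets; the following hypothetical stub (its name matches the calls, but the logging behaviour is an assumption, not project code) only mirrors that call shape so the snippets can be exercised in isolation.

import logging

logger = logging.getLogger(__name__)

def send_chat_notification(source, message, context):
    # Hypothetical stand-in: the real notifier presumably posts to a chat
    # service; here we only log so the surrounding examples can run.
    logger.error("chat notification from %s: %s (context=%r)",
                 source, message, context)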
Example No. 2
 def _scrape(self):
     try:
         html = self.source.fetch()
         soup = BeautifulSoup(html)
     except Exception, e:
         send_chat_notification(__file__, 'failed to fetch or parse the lobbyists index page', {'url': self.LOBBYISTS_INDEX_PAGE_URL})
         raise e
Example No. 3
 def _scrape(self):
     try:
         html = self.source.fetch()
         soup = BeautifulSoup(html)
     except Exception, e:
         send_chat_notification(
             __file__, 'failed to fetch or parse the lobbyists index page',
             {'url': self.LOBBYISTS_INDEX_PAGE_URL})
         raise e
Example No. 4
 def get_page(self,url):
     html_page = None
     retry_count = 0
     while not(html_page):
         try:
             html_page = urllib2.urlopen(url, timeout=30).read()
         except urllib2.URLError:
             retry_count += 1
             if retry_count >= 10:
                 send_chat_notification(__name__, "URL failed too many times", {"url": url})
                 raise urllib2.URLError('URL %s failed too many times' % url)
     html_page = re.sub("(?s)<!--.*?-->"," ", html_page) # cut anything that looks suspicious
     html_page = re.sub("(?s)<script>.*?</script>"," ", html_page)
     return html_page
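Example No. 4 keeps retrying the fetch until it succeeds or ten attempts have failed, and only then notifies and re-raises. The retry-with-cap idea can be isolated as below; the helper name and the for-loop form are illustrative rather than taken from the project, and the sketch follows the snippets' Python 2 / urllib2 style.

import urllib2

def fetch_with_retries(url, max_retries=10, timeout=30):
    # Retry urlopen() up to max_retries times, re-raising the last URLError.
    for attempt in range(1, max_retries + 1):
        try:
            return urllib2.urlopen(url, timeout=timeout).read()
        except urllib2.URLError:
            if attempt == max_retries:
                raise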
Example No. 5
 def _get_meetings(self, committee_id, from_date, to_date):
     try:
         meetings = DataserviceCommitteeMeeting.get(committee_id, from_date, to_date)
         return meetings
     except Exception as e:
         err_msg = ERR_MSG.format(committee_id)
         err_msg_report = ERR_MSG_REPORT.format(committee_id, str(e))
         DataserviceCommitteeMeeting.error_report(err_msg, err_msg_report)
         self._log_error(err_msg)
         send_chat_notification(__name__,
                                "Received unexpected exception from DataServiceCommitteeMeeting.get()",
                                {'exception': traceback.format_exc(),
                                 'committee_id': committee_id,
                                 'from_date': from_date,
                                 'to_date': to_date})
     return []
Example No. 6
def _get_committees_index_page(full):
    if full:
        url = FULL_URL
        encoding = "iso_8859_8"
    else:
        url = URL
        # encoding='utf8'
        # the encoding of this page used to be utf-8 but looks like they reverted back to iso-8859-8
        encoding = "iso_8859_8"
    logger.info("getting index page html from " + url)
    try:
        return unicode(urllib2.urlopen(url).read(), encoding)
    except:
        logger.error("could not fetch committees_index_page, exception: " + traceback.format_exc())
        send_chat_notification(__name__, "could not fetch committees index page", {"url": url})
        return ""
Example No. 7
def _get_committees_index_page(full):
    if full:
        url = FULL_URL
        encoding = 'iso_8859_8'
    else:
        url = PLENUM_URL
        # encoding='utf8'
        # the encoding of this page used to be utf-8 but looks like they reverted back to iso-8859-8
        encoding = 'iso_8859_8'
    logger.info('getting index page html from %s' % url)
    try:
        return unicode(urllib2.urlopen(url).read(), encoding)
    except:
        logger.exception(u'could not fetch committees_index_page for url %s' % url)
        send_chat_notification(__name__, "could not fetch committees index page", {'url': url})
        return ''
Example No. 8
def _get_committees_index_page(full):
    if full:
        url = FULL_URL
        encoding = 'iso_8859_8'
    else:
        url = URL
        # encoding='utf8'
        # the encoding of this page used to be utf-8 but looks like they reverted back to iso-8859-8
        encoding = 'iso_8859_8'
    logger.info('getting index page html from %s' % url)
    try:
        return unicode(urllib2.urlopen(url).read(), encoding)
    except:
        logger.exception(u'could not fetch committees_index_page for url %s' %
                         url)
        send_chat_notification(__name__,
                               "could not fetch committees index page",
                               {'url': url})
        return ''
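Examples No. 6 to 8 are near-identical revisions of _get_committees_index_page; the commented-out encoding='utf8' line records that the site's encoding flipped between UTF-8 and iso-8859-8 at some point. If the encoding is genuinely unstable, one defensive option (a sketch only, not part of the project code) is to try UTF-8 first and fall back to iso-8859-8:

def decode_index_page(raw_bytes):
    # Prefer UTF-8; Hebrew pages actually served as iso-8859-8 normally fail
    # UTF-8 decoding, so the fallback is reached only when it is needed.
    for encoding in ('utf-8', 'iso_8859_8'):
        try:
            return raw_bytes.decode(encoding)
        except UnicodeDecodeError:
            continue
    # last resort: decode with replacement characters rather than failing
    return raw_bytes.decode('iso_8859_8', 'replace')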
Example No. 9
 def get_page_with_param(self,params):
     logger.debug('get_page_with_param: self.url=%s, params=%s' % (self.url, params))
     if params == None:
         try:
             html_page = urllib2.urlopen(self.url).read().decode('windows-1255').encode('utf-8')
         except urllib2.URLError:
             logger.error("can't open URL: %s" % self.url)
             send_chat_notification(__name__, 'failed to open url', {'url': self.url, 'params': None})
             return None
         try:
             soup = BeautifulSoup(html_page)
         except HTMLParseError, e:
             logger.debug("parsing URL: %s - %s. will try harder." % (self.url, e))
             html_page = re.sub("(?s)<!--.*?-->"," ", html_page) # cut anything that looks suspicious
             html_page = re.sub("(?s)<script>.*?</script>"," ", html_page)
             html_page = re.sub("(?s)<!.*?>"," ", html_page)
             try:
                 soup = BeautifulSoup(html_page)
             except HTMLParseError, e:
                 logger.debug("error parsing URL: %s - %s" % (self.url, e))
                 send_chat_notification(__name__, 'failed to parse url', {'url': self.url, 'params': None})
                 return None
Example No. 10
    def get_page_with_param(self, params):
        logger.debug('get_page_with_param: self.url=%s, params=%s' % (self.url, params))
        if not params:
            try:
                html_page = urllib2.urlopen(self.url).read().decode('windows-1255').encode('utf-8')
            except urllib2.URLError as e:
                logger.error("can't open URL: %s" % self.url)
                send_chat_notification(__name__, 'failed to open url', {'url': self.url, 'params': params})
                return None
            try:
                soup = BeautifulSoup(html_page)

            except HTMLParseError as e:
                logger.debug("parsing URL: %s - %s. will try harder." % (self.url, e))
                html_page = re.sub("(?s)<!--.*?-->", " ", html_page)  # cut anything that looks suspicious
                html_page = re.sub("(?s)<script>.*?</script>", " ", html_page)
                html_page = re.sub("(?s)<!.*?>", " ", html_page)
                try:
                    soup = BeautifulSoup(html_page)
                except HTMLParseError as e:
                    logger.debug("error parsing URL: %s - %s" % (self.url, e))
                    send_chat_notification(__name__, 'failed to parse url', {'url': self.url, 'params': None})
                    return None
            comments = soup.findAll(text=lambda text: isinstance(text, Comment))
            [comment.extract() for comment in comments]
            return soup
        else:
            data = urllib.urlencode(params)
            try:
                url_data = urllib2.urlopen(self.url, data)
            except urllib2.URLError:
                logger.error("can't open URL: %s" % self.url)
                send_chat_notification(__name__, 'failed to open url', {'url': self.url, 'params': data})
                return None
            html_page = url_data.read().decode('windows-1255').encode('utf-8')
            try:
                soup = BeautifulSoup(html_page)

            except HTMLParseError as e:
                logger.debug("error parsing URL: %s - %s" % (self.url, e))
                send_chat_notification(__name__, 'failed to parse url', {'url': self.url, 'params': data})
                return None
            comments = soup.findAll(text=lambda text: isinstance(text, Comment))
            [comment.extract() for comment in comments]
            return soup
Example No. 11
    def get_page_with_param(self, params):
        logger.debug('get_page_with_param: self.url=%s, params=%s' %
                     (self.url, params))
        if not params:
            try:
                html_page = urllib2.urlopen(
                    self.url).read().decode('windows-1255').encode('utf-8')
            except urllib2.URLError as e:
                logger.error("can't open URL: %s" % self.url)
                send_chat_notification(__name__, 'failed to open url', {
                    'url': self.url,
                    'params': params
                })
                return None
            try:
                soup = BeautifulSoup(html_page)

            except HTMLParseError as e:
                logger.debug("parsing URL: %s - %s. will try harder." %
                             (self.url, e))
                html_page = re.sub(
                    "(?s)<!--.*?-->", " ",
                    html_page)  # cut anything that looks suspicious
                html_page = re.sub("(?s)<script>.*?</script>", " ", html_page)
                html_page = re.sub("(?s)<!.*?>", " ", html_page)
                try:
                    soup = BeautifulSoup(html_page)
                except HTMLParseError as e:
                    logger.debug("error parsing URL: %s - %s" % (self.url, e))
                    send_chat_notification(__name__, 'failed to parse url', {
                        'url': self.url,
                        'params': None
                    })
                    return None
            comments = soup.findAll(
                text=lambda text: isinstance(text, Comment))
            [comment.extract() for comment in comments]
            return soup
        else:
            data = urllib.urlencode(params)
            try:
                url_data = urllib2.urlopen(self.url, data)
            except urllib2.URLError:
                logger.error("can't open URL: %s" % self.url)
                send_chat_notification(__name__, 'failed to open url', {
                    'url': self.url,
                    'params': data
                })
                return None
            html_page = url_data.read().decode('windows-1255').encode('utf-8')
            try:
                soup = BeautifulSoup(html_page)

            except HTMLParseError as e:
                logger.debug("error parsing URL: %s - %s" % (self.url, e))
                send_chat_notification(__name__, 'failed to parse url', {
                    'url': self.url,
                    'params': data
                })
                return None
            comments = soup.findAll(
                text=lambda text: isinstance(text, Comment))
            [comment.extract() for comment in comments]
            return soup
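Examples No. 10 and 11 strip HTML comment nodes from the parsed tree before returning the soup. That idiom can be shown on its own; the sketch below assumes the BeautifulSoup 3 API that the snippets appear to use (in BeautifulSoup 4 the equivalents are bs4.Comment and find_all).

from BeautifulSoup import BeautifulSoup, Comment  # BeautifulSoup 3, as in the snippets

def strip_comments(html):
    # Parse the page and remove every HTML comment node from the tree.
    soup = BeautifulSoup(html)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    return soup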
Example No. 12
                html_page = re.sub("(?s)<script>.*?</script>"," ", html_page)
                html_page = re.sub("(?s)<!.*?>"," ", html_page)
                try:
                    soup = BeautifulSoup(html_page)
                except HTMLParseError, e:
                    logger.debug("error parsing URL: %s - %s" % (self.url, e))
                    send_chat_notification(__name__, 'failed to parse url', {'url': self.url, 'params': None})
                    return None
            return soup
        else:
            data = urllib.urlencode(params)
            try:
                url_data = urllib2.urlopen(self.url,data)
            except urllib2.URLError:
                logger.error("can't open URL: %s" % self.url)
                send_chat_notification(__name__, 'failed to open url', {'url': self.url, 'params': data})
                return None
            html_page = url_data.read().decode('windows-1255').encode('utf-8')
            try:
                soup = BeautifulSoup(html_page)
            except HTMLParseError, e:
                logger.debug("error parsing URL: %s - %s" % (self.url, e))
                send_chat_notification(__name__, 'failed to parse url', {'url': self.url, 'params': data})
                return None
            return soup

def fix_dash(s):
    """returns s with normalized spaces before and after the dash"""
    if not s:
        return None
    m = re.match(r'(תיקון)( ?)(-)( ?)(.*)'.decode('utf8'),s)
Example No. 13
                except HTMLParseError, e:
                    logger.debug("error parsing URL: %s - %s" % (self.url, e))
                    send_chat_notification(__name__, 'failed to parse url', {
                        'url': self.url,
                        'params': None
                    })
                    return None
            return soup
        else:
            data = urllib.urlencode(params)
            try:
                url_data = urllib2.urlopen(self.url, data)
            except urllib2.URLError:
                logger.error("can't open URL: %s" % self.url)
                send_chat_notification(__name__, 'failed to open url', {
                    'url': self.url,
                    'params': data
                })
                return None
            html_page = url_data.read().decode('windows-1255').encode('utf-8')
            try:
                soup = BeautifulSoup(html_page)
            except HTMLParseError, e:
                logger.debug("error parsing URL: %s - %s" % (self.url, e))
                send_chat_notification(__name__, 'failed to parse url', {
                    'url': self.url,
                    'params': data
                })
                return None
            return soup