Example #1
    def http_error_301(self, req, fp, code, msg, headers):
        result = urllib2.HTTPRedirectHandler.http_error_301(
                        self, req, fp, code, msg, headers)

        # Keep the redirect status code on the response object
        result.status = code
        logger.debug('Redirect URL (301): %s' % result.url)
        return result
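Examples #1 and #8 are methods of a urllib2.HTTPRedirectHandler subclass; Example #12 installs that subclass under the name SmartRedirectHandler. A minimal sketch of the wiring, reusing the method above (the logger setup is an assumption for illustration):

import logging
import urllib2

logger = logging.getLogger(__name__)

class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_301(self, req, fp, code, msg, headers):
        result = urllib2.HTTPRedirectHandler.http_error_301(
            self, req, fp, code, msg, headers)
        # Keep the redirect status code on the response object
        result.status = code
        logger.debug('Redirect URL (301): %s' % result.url)
        return result

# Installed with build_opener, as Example #12 does:
opener = urllib2.build_opener(SmartRedirectHandler())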
Example #2
 def log_doc(self, doc):
     txt = 'Document saved:\n'
     for key, item in doc.items():
         if key not in ('summary', 'document'):
             txt += '   %-12s: %s\n' % (key, item)
     logger.debug(txt[:-1])
     logger.warn('Got *new* document: %(doc_type)s %(number)s %(date_st)s' % doc)
Example #3
 def process_digesto(self, doc_obj):
     '''
     Gets more information from the digesto system: extracts the
     document's HTML text and saves it.
     '''
     # Do we have a digesto entry? If not, return
     if not self.doc.data['digesto']:
         logger.debug(msg_doc('No digesto:', self.doc))
         return
     # Check for digesto text
     document_text = self.check_digesto(doc_obj)
     # If it does not exist or we have a forced update read the html
     if not document_text:
         logger.debug(msg_doc('New digesto:', self.doc))
         document_text = DocumentText()
     elif self.options['update_digesto']:
         logger.debug(msg_doc('Update digesto:', self.doc))
     else:
         logger.debug(msg_doc('Already have digesto:', self.doc))
         return
     # Get the digesto text
     text = self.get_digesto()
     if not text:
         logger.debug(msg_doc('No digesto text:', self.doc))
         return
     # Save the text
     self.save_digesto(document_text, doc_obj, text)
Example #4
 def save_doc(self):
     # Check for document duplication
     doc_obj = self.check_duplicate()
     if self.mode == UPDATE:
         if not self.options["update"]:
             logger.debug(msg_doc("IGNORING duplicated document:", self.doc))
             raise DREDuplicateError("Not going to process this doc.")
         else:
             logger.warn(msg_doc("UPDATE mode:", self.doc))
             logger.debug("doc_obj: %s" % doc_obj)
     else:
         logger.warn(msg_doc("NEW mode:", self.doc))
     # Save metadata
     if self.mode == NEW or (self.mode == UPDATE and self.options["update_metadata"]):
         logger.debug(msg_doc("Metadata:", self.doc))
         self.save_metadata(doc_obj)
         self.check_forgetme(doc_obj)
     # Save digesto
     if self.mode == NEW or (self.mode == UPDATE and self.options["update_digesto"]):
         self.process_digesto(doc_obj)
     # Update inforce
     if self.mode == NEW or (self.mode == UPDATE and self.options["update_inforce"]):
         logger.debug(msg_doc("Update inforce:", self.doc))
         self.update_inforce(doc_obj)
     # Save PDF
     if self.mode == NEW or (self.mode == UPDATE and self.options["save_pdf"]):
         logger.debug(msg_doc("Get PDF:", self.doc))
         self.save_pdf(doc_obj)
Example #5
    def get_digesto( self, doc ):
        document = doc['document']
        doc_id = doc['digesto']

        # Checks if the document already has the digesto text
        try:
            document_text = DocumentText.objects.get( document = document )
        except ObjectDoesNotExist:
            logger.warn('Getting digesto text: %(doc_type)s %(number)s %(date_st)s' % doc)
        else:
            return

        # Gets the DIGESTO system integral text
        soup = read_soup( digesto_url % doc_id )

        # Parse the text
        # <li class="formatedTextoWithLinks">
        try:
            text = soup.find( 'li', { 'class': 'formatedTextoWithLinks' }
                    ).renderContents()
            text = text.replace('<span>Texto</span>','')
        except AttributeError:
            # No digesto text, abort
            logger.debug('No digesto text.')
            return

        # Save the text to the database
        document_text = DocumentText()
        document_text.document = document
        document_text.text_url = digesto_url % doc_id
        document_text.text = text
        document_text.save()
Example #6
    def check_duplicate( self, doc ):
        # For dates before the site change we should try to verify
        # document duplication by other means (since the 'claint' changed
        # on the new site)
        if doc['date'] < datetime.datetime(2014, 9, 19):
            # Does the current doc_type have synonyms?
            doc_types = [ doc['doc_type'].lower() ]
            for sn in synonyms:
                if doc['doc_type'].lower() in sn:
                    doc_types = sn

            # Create a query for the synonyms:
            dt_qs = Q( doc_type__iexact = doc_types[0] )
            for dt in doc_types[1:]:
                dt_qs = dt_qs | Q( doc_type__iexact = dt )

            dl = Document.objects.filter(
                    date__exact = doc['date'] ).filter(
                    dt_qs ).filter(
                    number__iexact = doc['number'] ).filter(
                    series__exact = doc['series'] )

            if len(dl) > 1:
                # We have a number of documents that, for a given date, have
                # duplicates with the same number and type. The dates can be
                # listed with:
                # select
                #   count(*), date, doc_type, number
                # from
                #   dreapp_document
                # where
                #   date < '2014-9-18'
                # group by
                #   date, doc_type, number
                # having
                #   count(*) > 1;
                logger.error('Duplicate document in the database: %(doc_type)s %(number)s %(date_st)s' % doc)
                raise DREScraperError('More than one doc with the same number and type.')

            if len(dl) == 1:
                doc['document'] = dl[0]
                raise DREDuplicateError('Duplicate document')

        # For other dates we simply use the db integrity checks to spot a
        # duplicate
        document = doc['document']
        try:
            sid = transaction.savepoint()
            document.save()
            transaction.savepoint_commit(sid)
            logger.debug('ID: %d http://dre.tretas.org/dre/%d/' % (document.id, document.id) )
        except IntegrityError:
            # Duplicated document
            transaction.savepoint_rollback(sid)
            doc['document'] = Document.objects.get(claint = doc['id'] )
            raise DREDuplicateError('Duplicate document')
Example #7
def parse_pdf(doc):
    # Public tenders:
    tender_types = (
        u'anúncio de procedimento',
        u'aviso de prorrogação de prazo',
        u'declaração de retificação de anúncio',
        u'anúncio de concurso urgente',
    )
    if doc.doc_type.lower() in tender_types:
        logger.debug('CACHEPDF Tender text extract from pdf for doc id=%d' % doc.id)
        return ParseTenderPdf(doc).run()

    # Generic documents:
    logger.debug('CACHEPDF Generic text extract from pdf for doc id=%d' % doc.id)
    return ParseGenericPdf(doc).run()
Example #8
    def http_error_302(self, req, fp, code, msg, headers):
        result = urllib2.HTTPRedirectHandler.http_error_302(
                        self, req, fp, code, msg, headers)

        result.status = code
        logger.debug('Redirect URL (302): %s' % result.url)

        ### DRE ugly hack: instead of issuing an HTTP error code, the dre.pt
        ### site redirects to an error page. Here we catch the error and
        ### raise an exception.

        if 'Paginas/Erro.aspx' in result.url:
            raise DREError('Error condition on the site')

        return result
Example #9
def save_file(filename, url):
    k = 1
    while True:
        try:
            url, data_blob, cookies = fetch_url( url )
            break
        except urllib2.HTTPError:
            logger.error('Could not read PDF: %s DOC: %s' % ( url, filename))
            k += 1
            if k == MAX_ATTEMPTS:
                raise DREError('Couldn\'t get the PDF: %s' % url )
            logger.debug('Sleeping 2 secs...')
            time.sleep(2)

    # The with statement closes the file on exit; no explicit close() needed
    with open(filename, 'wb') as f:
        f.write(data_blob)
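A hypothetical call (the filename and URL are illustrative, not from the source); save_file retries through fetch_url from Example #12 and raises DREError after MAX_ATTEMPTS failures:

# Illustrative values only
save_file('dre_123.pdf', 'https://dre.pt/application/file/123')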
Example #10
 def read_index(self):
     dr_id_number = self.data['dr_id_number']
     page = 1
     doc_list = []
     sufix = ''
     while True:
         logger.debug('JOURNAL: Read journal page')
         soup = read_soup(JOURNAL_URL % (dr_id_number, page, sufix))
         doc_page = self.get_document_list(soup)
         for doc in doc_page:
             try:
                 yield DREReadDoc(doc,self)
             except DREParseError:
                 pass
         if not doc_page:
             logger.debug('JOURNAL: Empty page')
             if not sufix and self.data['series']==2:
                 page = 0
                 sufix = '?at=c'
             else:
                 break
         page += 1
Example #11
    def save_doc_list(self):
        if not self.doc_list:
            logger.debug('Couldn\'t get documents for %s' % self.date.isoformat())
        for doc in self.doc_list:
            logger.debug('*** Processing document: %(doc_type)s %(number)s %(date_st)s' % doc )
            try:
                self.save_doc( doc )
            except DREDuplicateError:
                logger.debug('We have this document: %(doc_type)s %(number)s %(date_st)s' % doc )
                # Duplicated document: even if the document is duplicated we
                # check for the "digesto" text since sometimes this is created
                # long after the original date of the document.
                if doc['digesto']:
                    # Check the "digesto" integral text
                    self.get_digesto( doc )
                    # Check if the document is in force
                    self.get_in_force_status( doc )
                # In the new dre.pt the doc's pdf url has changed. Because of
                # this even in duplicated documents we update the pdf url.
                if doc['url']:
                    self.update_pdf( doc )
                continue
            except DREScraperError:
                continue

            # Get the "digesto" integral text
            if doc['digesto']:
                self.get_digesto( doc )
            # Check if the document is in force
            if doc['digesto']:
                self.get_in_force_status( doc )
            # Get the pdf version
            if doc['url']:
                self.save_pdf( doc )
            self.create_cache( doc )
            self.log_doc( doc )

            time.sleep(1)
Example #12
def fetch_url( url, data=None, cj=None ):
    # Treat url
    url_object = list(urlparse.urlsplit(url))
    if u'\xba' in url_object[2]:
        url_object[2] = url_object[2].encode('utf-8')
    url_object[2] = urllib.quote(url_object[2])
    url = urlparse.urlunsplit(url_object)

    # Get the payload
    repeat = 1
    while repeat:
        try:
            logger.debug('Getting: %s' % url)
            request = urllib2.Request(url, data)
            request.add_header('Accept-Encoding', 'gzip; q=1.0, identity; q=0.5')
            request.add_header('User-agent', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; chromeframe/11.0.696.57)')
            if not cj:
                cj = cookielib.LWPCookieJar()
            opener = urllib2.build_opener(SmartRedirectHandler(), urllib2.HTTPCookieProcessor(cj) )
            resource = opener.open( request )
            is_gzip = resource.headers.get('Content-Encoding') == 'gzip'

            payload = resource.read()

            url = resource.url

            resource.close()

            if is_gzip:
                try:
                    compressedstream = StringIO.StringIO(payload)
                    gzipper = gzip.GzipFile(fileobj=compressedstream)
                    payload = gzipper.read()
                except IOError:
                    pass

            repeat = False
        except (socket.timeout, socket.error):
            repeat += 1
            if repeat > MAXREPEAT:
                logger.critical('Socket timeout! Aborting')
                raise
            logger.debug('Socket timeout! Sleeping for 5 minutes')
            time.sleep(300)
        except (urllib2.URLError, urllib2.HTTPError) as e:
            msg = str(e)
            repeat += 1
            if repeat > MAXREPEAT:
                logger.critical('HTTP Error! Aborting. Error repeated %d times: %s' % (MAXREPEAT, msg) )
                raise DREError('Error condition on the site')
            if 'Error 400' in msg or 'Error 404' in msg:
                logger.critical('HTTP Error 40x - URL: %s' % url)
                raise
            if 'Error 503' in msg:
                logger.critical('HTTP Error 503 - cache problem going to try again in 10 seconds.')
                time.sleep(10)
                continue

            logger.warn('HTTP Error! Sleeping for 5 minutes: %s' % msg)
            time.sleep(300)

    t = random.randint(1, 5)
    logger.debug('Sleeping %ds' % t)
    time.sleep(t)
    return url, payload, cj
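fetch_url returns the final URL after redirects, the payload, and the cookie jar; callers unpack all three, as save_file does in Example #9. A minimal sketch of reusing the returned cookie jar to keep a session across requests (assumed usage, illustrative URL):

# The first call creates a cookie jar; passing it back preserves cookies
url, payload, cj = fetch_url('https://dre.pt/')
url, payload, cj = fetch_url(url, cj=cj)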
Example #13
 def save_doc(self):
     # Check for document duplication
     doc_obj = self.check_duplicate()
     if self.mode == UPDATE:
         if not self.options['update']:
             logger.debug(msg_doc('IGNORING duplicated document:',
                 self.doc))
             raise DREDuplicateError('Not going to process this doc.')
         else:
             logger.warn(msg_doc('UPDATE mode:', self.doc))
             logger.debug('doc_obj: %s' % doc_obj)
     else:
         logger.warn(msg_doc('NEW mode:', self.doc))
     # Save metadata
     if self.mode==NEW or (self.mode==UPDATE and
                           self.options['update_metadata']):
         logger.debug(msg_doc('Metadata:', self.doc))
         self.save_metadata(doc_obj)
         self.check_forgetme(doc_obj)
     # Save digesto
     if self.mode==NEW or (self.mode==UPDATE and
                           self.options['update_digesto']):
         self.process_digesto(doc_obj)
     # Update cache
     if self.mode==NEW or (self.mode==UPDATE and
                           self.options['update_cache']):
         logger.debug(msg_doc('Cache:', self.doc))
         self.update_cache(doc_obj)
     # Update inforce
     if self.mode==NEW or (self.mode==UPDATE and
                           self.options['update_inforce']):
         logger.debug(msg_doc('Update inforce:', self.doc))
         self.update_inforce(doc_obj)
     # Save PDF
     if self.mode==NEW or (self.mode==UPDATE and
                           self.options['save_pdf']):
         logger.debug(msg_doc('Get PDF:', self.doc))
         self.save_pdf(doc_obj)
Example #14
 def update_pdf( self, doc ):
     if doc['url'] and doc['document'].dre_pdf != doc['url']:
         doc['document'].dre_pdf = doc['url']
         doc['document'].save()
         logger.debug('PDF\'s url updated: %(doc_type)s %(number)s %(date_st)s' % doc)