예제 #1
0
    def check_duplicate( self, doc ):
        """Save ``doc['document']`` unless an equivalent document already exists.

        ``doc`` is a dict; this method reads its 'date', 'doc_type',
        'number', 'series', 'document' and 'id' keys (plus 'date_st' when
        logging an error).

        For dates before 2014-09-19 the database is first searched for a
        document with the same date, number, series and document type (or one
        of the type's synonyms). After that, for every date, the document is
        saved inside a savepoint and the database integrity checks are relied
        on to reject duplicates.

        Raises:
            DREScraperError: more than one stored document matched the lookup.
            DREDuplicateError: the document is already stored; before raising,
                ``doc['document']`` is replaced with the stored document.
        """
        # Before the site change the 'claint' cannot be trusted to identify a
        # document (it changed on the new site), so look the document up by
        # its other attributes.
        if doc['date'] < datetime.datetime(2014,9,19):
            # Use the whole synonym group when the type belongs to one.
            # NOTE(review): the indexing and slicing below imply every entry
            # of 'synonyms' is an indexable sequence -- confirm.
            wanted_type = doc['doc_type'].lower()
            type_names = [ wanted_type ]
            for group in synonyms:
                if wanted_type in group:
                    type_names = group

            # OR together one case-insensitive match per acceptable type name.
            type_filter = Q( doc_type__iexact = type_names[0] )
            for name in type_names[1:]:
                type_filter = type_filter | Q( doc_type__iexact = name )

            candidates = Document.objects.filter( date__exact = doc['date'] )
            candidates = candidates.filter( type_filter )
            candidates = candidates.filter( number__iexact = doc['number'] )
            candidates = candidates.filter( series__exact = doc['series'] )

            match_count = len(candidates)

            if match_count > 1:
                # The database already holds several documents sharing this
                # date, type and number. Such cases can be listed with:
                #   select count(*), date, doc_type, number
                #   from dreapp_document
                #   where date < '2014-9-18'
                #   group by date, doc_type, number
                #   having count(*) > 1;
                logger.error('Duplicate document in the database: %(doc_type)s %(number)s %(date_st)s' % doc)
                raise DREScraperError('More than one doc with the same number and type.')

            if match_count == 1:
                # Hand the stored document back to the caller through 'doc'.
                doc['document'] = candidates[0]
                raise DREDuplicateError('Duplicate document')

        # Remaining cases: let the database integrity checks spot a duplicate.
        new_document = doc['document']
        try:
            savepoint_id = transaction.savepoint()
            new_document.save()
            transaction.savepoint_commit(savepoint_id)
            doc_id = new_document.id
            logger.debug('ID: %d http://dre.tretas.org/dre/%d/' % (doc_id, doc_id) )
        except IntegrityError:
            # The save was rejected: undo it and fetch the stored document
            # whose 'claint' equals this document's id.
            transaction.savepoint_rollback(savepoint_id)
            doc['document'] = Document.objects.get(claint = doc['id'] )
            raise DREDuplicateError('Duplicate document')
예제 #2
0
파일: __init__.py 프로젝트: heldergg/dre
 def get_html(self):
     """Extract the document text from the PDF and return the parsed result.

     The text is read from ``self.filename`` restricted to two page areas,
     trimmed with ``self.cut_out_doc`` and handed to
     ``parse_generic_document.run``. When no text is left an error is logged
     for ``self.doc.id``, but the (empty) text is still passed to the parser.
     """
     # NOTE(review): presumably the bounding boxes of the left and right page
     # columns -- confirm the coordinate convention against
     # convert_pdf_to_txt_layout.
     first_area = ( 45, 40, 295, 780)
     second_area = (297, 40, 550, 780)

     raw_text = convert_pdf_to_txt_layout(self.filename, (first_area, second_area))
     doc_text = self.cut_out_doc(raw_text)
     if not doc_text:
         logger.error('CACHEPDF No text for doc id=%d', self.doc.id)
     return parse_generic_document.run(doc_text)
예제 #3
0
def save_file(filename, url):
    """Download ``url`` and write the response body to ``filename``.

    The fetch is retried, with a 2 second pause between tries, whenever
    ``fetch_url`` raises ``urllib2.HTTPError``. The attempt counter starts at
    1 and is incremented before it is compared with ``MAX_ATTEMPTS``, so at
    most ``MAX_ATTEMPTS - 1`` fetches are made (and always at least one).

    Args:
        filename: path of the file to create; it is opened in binary mode.
        url: address to download. ``fetch_url`` returns a url as well, and
            that returned value is what later log/error messages show.

    Raises:
        DREError: the download kept failing until the attempt limit was hit.
    """
    attempt = 1
    while True:
        try:
            # fetch_url returns a (url, data, cookies) triple; the cookies
            # are not needed here.
            url, data_blob, _cookies = fetch_url( url )
            break
        except urllib2.HTTPError:
            logger.error('Could not read PDF: %s DOC: %s' % ( url, filename))
            attempt += 1
            # '>=' instead of '==': the counter is already 2 at the first
            # check, so with MAX_ATTEMPTS < 2 an equality test never matches
            # and the loop would retry forever.
            if attempt >= MAX_ATTEMPTS:
                raise DREError('Couldn\'t get the PDF: %s' % url )
            logger.debug('Sleeping 2 secs...')
            time.sleep(2)

    # The 'with' statement closes the file; no explicit close() is needed.
    with open(filename, 'wb') as f:
        f.write(data_blob)