def check_duplicate( self, doc ):
    # For dates before the site change we should try to verify
    # the document duplication by other means (since the 'claint' changed
    # on the new site)
    if doc['date'] < datetime.datetime(2014, 9, 19):
        # Does the current doc_type have synonyms?
        doc_types = [ doc['doc_type'].lower() ]
        for sn in synonyms:
            if doc['doc_type'].lower() in sn:
                doc_types = sn
        # Create a query for the synonyms:
        dt_qs = Q( doc_type__iexact = doc_types[0] )
        for dt in doc_types[1:]:
            dt_qs = dt_qs | Q( doc_type__iexact = dt )
        dl = Document.objects.filter( date__exact = doc['date']
                ).filter( dt_qs
                ).filter( number__iexact = doc['number']
                ).filter( series__exact = doc['series'] )
        if len(dl) > 1:
            # We have a number of documents that, for a given date, have
            # duplicates with the same number and type. The dates can be
            # listed with:
            #   select
            #       count(*), date, doc_type, number
            #   from
            #       dreapp_document
            #   where
            #       date < '2014-9-18'
            #   group by
            #       date, doc_type, number
            #   having
            #       count(*) > 1;
            logger.error('Duplicate document in the database: %(doc_type)s %(number)s %(date_st)s' % doc)
            raise DREScraperError('More than one doc with the same number and type.')
        if len(dl) == 1:
            doc['document'] = dl[0]
            raise DREDuplicateError('Duplicate document')
    # For other dates we simply use the db integrity checks to spot a
    # duplicate
    document = doc['document']
    try:
        sid = transaction.savepoint()
        document.save()
        transaction.savepoint_commit(sid)
        logger.debug('ID: %d http://dre.tretas.org/dre/%d/' % (document.id, document.id))
    except IntegrityError:
        # Duplicated document
        transaction.savepoint_rollback(sid)
        doc['document'] = Document.objects.get( claint = doc['id'] )
        raise DREDuplicateError('Duplicate document')
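
# Illustrative sketch (assumption, not part of the original module): `synonyms`
# is taken here to be an iterable of groups of lower-cased, interchangeable
# doc_type spellings, so the OR-chained Q filter built in check_duplicate()
# matches any spelling in the group. All values below are hypothetical.
def _example_synonym_filter(doc_type='Dec.-Lei'):
    from django.db.models import Q

    synonyms = [
        ['decreto-lei', 'dec.-lei'],     # hypothetical synonym group
        ['resolucao', 'resolution'],     # hypothetical synonym group
    ]
    doc_types = [ doc_type.lower() ]
    for sn in synonyms:
        if doc_type.lower() in sn:
            doc_types = sn
    # OR together one case-insensitive match per synonym:
    dt_qs = Q( doc_type__iexact = doc_types[0] )
    for dt in doc_types[1:]:
        dt_qs = dt_qs | Q( doc_type__iexact = dt )
    return dt_qs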
def get_html(self):
    allowed_areas = (
        (  45, 40, 295, 780),
        ( 297, 40, 550, 780),
    )
    txt = convert_pdf_to_txt_layout(self.filename, allowed_areas)
    txt = self.cut_out_doc(txt)
    if not txt:
        logger.error('CACHEPDF No text for doc id=%d', self.doc.id)
    return parse_generic_document.run(txt)
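
# Illustrative sketch (assumption): the `allowed_areas` tuples in get_html()
# are read here as (x0, y0, x1, y1) rectangles in PDF points, one per text
# column, and convert_pdf_to_txt_layout() is assumed to keep only text placed
# inside one of them. This hypothetical helper shows that containment test.
def _example_inside_allowed_areas(x, y, allowed_areas):
    for x0, y0, x1, y1 in allowed_areas:
        if x0 <= x <= x1 and y0 <= y <= y1:
            return True
    return False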
def save_file(filename, url):
    k = 1
    while True:
        try:
            url, data_blob, cookies = fetch_url( url )
            break
        except urllib2.HTTPError:
            logger.error('Could not read PDF: %s DOC: %s' % ( url, filename))
            k += 1
            if k == MAX_ATTEMPTS:
                raise DREError('Couldn\'t get the PDF: %s' % url )
            logger.debug('Sleeping 2 secs...')
            time.sleep(2)

    with open(filename, 'wb') as f:
        f.write(data_blob)
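
# Usage sketch (assumption): the path and URL below are hypothetical; the real
# caller builds them from the scraped document metadata. On repeated
# urllib2.HTTPError failures save_file() retries, sleeping 2 seconds between
# attempts, and raises DREError once MAX_ATTEMPTS is reached.
def _example_save_file():
    save_file('cache/12345.pdf', 'https://example.com/some-document.pdf')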