def make_recap_document(self, doc_node, docket_entry, entry_number,
                        attachment_number, document_type, debug):
    """Create or update a RECAPDocument from a parsed XML document node.

    Matches on pacer_doc_id; when no match exists, a new document is
    started on the given docket entry. Nothing is saved when debug is
    True.
    """
    doc_id = self.get_str_from_node(doc_node, 'pacer_doc_id')
    try:
        rd = RECAPDocument.objects.get(pacer_doc_id=doc_id)
    except RECAPDocument.DoesNotExist:
        # First time we've seen this document; attach it to the entry.
        rd = RECAPDocument(
            pacer_doc_id=doc_id,
            docket_entry=docket_entry,
        )

    rd.date_upload = self.get_datetime_from_node(doc_node, 'upload_date')
    # Keep existing values when the node provides nothing new.
    if document_type:
        rd.document_type = document_type
    if entry_number:
        rd.document_number = entry_number

    # An unparseable availability node comes back as None; store False then.
    available = self.get_bool_from_node(doc_node, 'available')
    if available is None:
        rd.is_available = False
    else:
        rd.is_available = available

    rd.sha1 = self.get_str_from_node(doc_node, 'sha1')
    short_desc = self.get_str_from_node(doc_node, 'short_desc')
    if short_desc:
        rd.description = short_desc

    if rd.is_available:
        rd.filepath_ia = get_ia_document_url_from_path(
            self.path, entry_number, attachment_number)
        local_url = get_local_document_url_from_path(
            self.path, entry_number, attachment_number)
        rd.filepath_local = os.path.join('recap', local_url)

    if document_type == RECAPDocument.ATTACHMENT:
        rd.attachment_number = attachment_number

    if not debug:
        rd.save()
    return rd
def make_recap_document(self, doc_node, docket_entry, entry_number,
                        attachment_number, document_type, debug):
    """Make a PACER document.

    Looks up an existing RECAPDocument on this docket entry by document
    and attachment number and updates it in place, or creates a new one
    when no match is found.

    :param doc_node: The XML node describing the document.
    :param docket_entry: The DocketEntry the document belongs to.
    :param entry_number: The document number on the docket.
    :param attachment_number: The attachment number (0 for the main
        document).
    :param document_type: A RECAPDocument type constant, or a falsy value
        to leave any existing type unchanged.
    :param debug: When True, skip saving to the database.
    :return: The RECAPDocument, or None when saving hit an IntegrityError.
    """
    pacer_document_id = self.get_str_from_node(doc_node, 'pacer_doc_id')
    try:
        rd = RECAPDocument.objects.get(
            docket_entry=docket_entry,
            document_number=entry_number,
            # Use the attachment number if it is not 0, else use None.
            attachment_number=attachment_number or None,
        )
    except RECAPDocument.DoesNotExist:
        rd = RECAPDocument(
            docket_entry=docket_entry,
            pacer_doc_id=pacer_document_id,
            document_number=entry_number,
        )
    else:
        # Found an existing row; only overwrite its pacer_doc_id when the
        # node actually supplied one.
        rd.pacer_doc_id = pacer_document_id or rd.pacer_doc_id

    rd.date_upload = self.get_datetime_from_node(doc_node, 'upload_date')
    rd.document_type = document_type or rd.document_type

    # If we can't parse the availability node (it returns None), default it
    # to False.
    availability = self.get_bool_from_node(doc_node, 'available')
    rd.is_available = False if availability is None else availability
    rd.sha1 = self.get_str_from_node(doc_node, 'sha1')
    rd.description = (self.get_str_from_node(doc_node, 'short_desc') or
                      rd.description)
    if rd.is_available:
        rd.filepath_ia = get_ia_document_url_from_path(
            self.path, entry_number, attachment_number)
        rd.filepath_local = os.path.join(
            'recap',
            get_local_document_url_from_path(self.path, entry_number,
                                             attachment_number),
        )
        if rd.page_count is None:
            # NOTE(review): assumes the local file already exists on disk
            # so that .path resolves — confirm against the storage backend.
            extension = rd.filepath_local.path.split('.')[-1]
            rd.page_count = get_page_count(rd.filepath_local.path, extension)
    if document_type == RECAPDocument.ATTACHMENT:
        rd.attachment_number = attachment_number
    if not debug:
        try:
            # Skip extraction/indexing here; callers batch that work.
            rd.save(do_extraction=False, index=False)
        except IntegrityError as e:
            # This happens when a pacer_doc_id has been wrongly set as
            # the document_number, see for example, document 19 and
            # document 00405193374 here: https://ia802300.us.archive.org/23/items/gov.uscourts.ca4.14-1872/gov.uscourts.ca4.14-1872.docket.xml
            logger.error(
                "Unable to create RECAPDocument for document #%s, "
                "attachment #%s on entry: %s due to "
                "IntegrityError." % (rd.document_number,
                                     rd.attachment_number,
                                     rd.docket_entry))
            return None
    return rd
def make_recap_document(self, doc_node, docket_entry, entry_number,
                        attachment_number, document_type, debug):
    """Make a PACER document.

    Looks up an existing RECAPDocument on this docket entry by document
    and attachment number and updates it in place, or creates a new one
    when no match is found. Existing availability and sha1 values are
    preserved rather than clobbered.

    :param doc_node: The XML node describing the document.
    :param docket_entry: The DocketEntry the document belongs to.
    :param entry_number: The document number on the docket.
    :param attachment_number: The attachment number (0 for the main
        document).
    :param document_type: A RECAPDocument type constant, or a falsy value
        to leave any existing type unchanged.
    :param debug: When True, skip saving to the database.
    :return: The created or updated RECAPDocument.
    """
    pacer_document_id = self.get_str_from_node(
        doc_node, 'pacer_doc_id')
    try:
        rd = RECAPDocument.objects.get(
            docket_entry=docket_entry,
            document_number=entry_number,
            # Use the attachment number if it is not 0, else use None.
            attachment_number=attachment_number or None,
        )
    except RECAPDocument.DoesNotExist:
        rd = RECAPDocument(
            docket_entry=docket_entry,
            pacer_doc_id=pacer_document_id,
            document_number=entry_number,
        )
    else:
        # Found an existing row; only overwrite its pacer_doc_id when the
        # node actually supplied one.
        rd.pacer_doc_id = pacer_document_id or rd.pacer_doc_id

    rd.date_upload = self.get_datetime_from_node(doc_node, 'upload_date')
    rd.document_type = document_type or rd.document_type
    if not rd.is_available:
        # If we can't parse the availability node (it returns None),
        # default it to False.
        availability = self.get_bool_from_node(doc_node, 'available')
        rd.is_available = False if availability is None else availability
    if not rd.sha1:
        rd.sha1 = self.get_str_from_node(doc_node, 'sha1')
    rd.description = (self.get_str_from_node(doc_node, 'short_desc') or
                      rd.description)
    if rd.is_available:
        rd.filepath_ia = get_ia_document_url_from_path(
            self.path, entry_number, attachment_number)
        rd.filepath_local = os.path.join(
            'recap',
            get_local_document_url_from_path(self.path, entry_number,
                                             attachment_number),
        )
        if rd.page_count is None:
            # NOTE(review): assumes the local file already exists on disk
            # so that .path resolves — confirm against the storage backend.
            extension = rd.filepath_local.path.split('.')[-1]
            rd.page_count = get_page_count(rd.filepath_local.path, extension)
    if document_type == RECAPDocument.ATTACHMENT:
        rd.attachment_number = attachment_number
    if not debug:
        # Skip extraction/indexing here; callers batch that work.
        rd.save(do_extraction=False, index=False)
    return rd
def make_recap_document(self, doc_node, docket_entry, entry_number,
                        attachment_number, document_type, debug):
    """Create or update a RECAPDocument from a parsed XML document node.

    Matches on pacer_doc_id; when no match exists, a new document is
    started on the given docket entry. Nothing is saved when debug is
    True.
    """
    doc_id = self.get_str_from_node(
        doc_node, 'pacer_doc_id')
    try:
        rd = RECAPDocument.objects.get(pacer_doc_id=doc_id)
    except RECAPDocument.DoesNotExist:
        # First time we've seen this document; attach it to the entry.
        rd = RECAPDocument(
            pacer_doc_id=doc_id,
            docket_entry=docket_entry,
        )

    rd.date_upload = self.get_datetime_from_node(doc_node, 'upload_date')
    if document_type:
        rd.document_type = document_type
    # Only trust numeric entry numbers for the document_number field.
    if isinstance(entry_number, int):
        rd.document_number = entry_number

    # An unparseable availability node comes back as None; store False then.
    available = self.get_bool_from_node(doc_node, 'available')
    rd.is_available = available if available is not None else False

    rd.sha1 = self.get_str_from_node(doc_node, 'sha1')
    short_desc = self.get_str_from_node(doc_node, 'short_desc')
    if short_desc:
        rd.description = short_desc

    if rd.is_available:
        rd.filepath_ia = get_ia_document_url_from_path(
            self.path, entry_number, attachment_number)
        local_url = get_local_document_url_from_path(
            self.path, entry_number, attachment_number)
        rd.filepath_local = os.path.join('recap', local_url)

    if document_type == RECAPDocument.ATTACHMENT:
        rd.attachment_number = attachment_number

    if not debug:
        rd.save()
    return rd
def process_recap_pdf(self, pk):
    """Process an uploaded PDF from the RECAP API endpoint.

    Save the RECAP PDF to the database, attaching it to the matching
    RECAPDocument (creating one when needed), then queue extraction and
    search indexing for new content.

    Fix: the original had a second, dead bare-string statement
    ("Save a RECAP PDF to the database.") immediately after the real
    docstring; its text is folded into this docstring instead.

    :param pk: The PK of the processing queue item you want to work on.
    :return: A RECAPDocument object that was created or updated, or None
        on permanent failure.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)

    # An attachment number on the queue item tells us which kind of
    # document this upload is.
    if pq.attachment_number is None:
        document_type = RECAPDocument.PACER_DOCUMENT
    else:
        document_type = RECAPDocument.ATTACHMENT

    logger.info("Processing RECAP item (debug is: %s): %s " % (pq.debug, pq))
    try:
        if pq.pacer_case_id:
            rd = RECAPDocument.objects.get(
                docket_entry__docket__pacer_case_id=pq.pacer_case_id,
                pacer_doc_id=pq.pacer_doc_id,
            )
        else:
            # Sometimes we don't have the case ID from PACER. Try to make
            # this work anyway.
            rd = RECAPDocument.objects.get(pacer_doc_id=pq.pacer_doc_id)
    except (RECAPDocument.DoesNotExist,
            RECAPDocument.MultipleObjectsReturned):
        try:
            d = Docket.objects.get(
                pacer_case_id=pq.pacer_case_id, court_id=pq.court_id
            )
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully
            # the docket will be in place soon (it could be in a
            # different upload task that hasn't yet been processed).
            logger.warning(
                "Unable to find docket for processing queue '%s'. "
                "Retrying if max_retries is not exceeded." % pq
            )
            error_message = "Unable to find docket for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, error_message, PROCESSING_STATUS.FAILED)
                return None
            else:
                mark_pq_status(
                    pq, error_message, PROCESSING_STATUS.QUEUED_FOR_RETRY
                )
                raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None

        # Got the Docket, attempt to get/create the DocketEntry, and then
        # create the RECAPDocument
        try:
            de = DocketEntry.objects.get(
                docket=d, entry_number=pq.document_number
            )
        except DocketEntry.DoesNotExist as exc:
            logger.warning(
                "Unable to find docket entry for processing "
                "queue '%s'." % pq
            )
            msg = "Unable to find docket entry for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
                return None
            else:
                mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
                raise self.retry(exc=exc)
        else:
            # If we're here, we've got the docket and docket
            # entry, but were unable to find the document by
            # pacer_doc_id. This happens when pacer_doc_id is
            # missing, for example. ∴, try to get the document
            # from the docket entry.
            try:
                rd = RECAPDocument.objects.get(
                    docket_entry=de,
                    document_number=pq.document_number,
                    attachment_number=pq.attachment_number,
                    document_type=document_type,
                )
            except (
                RECAPDocument.DoesNotExist,
                RECAPDocument.MultipleObjectsReturned,
            ):
                # Unable to find it. Make a new item.
                rd = RECAPDocument(
                    docket_entry=de,
                    pacer_doc_id=pq.pacer_doc_id,
                    document_type=document_type,
                )

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    try:
        content = pq.filepath_local.read()
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (
            exc.errno,
            exc.strerror,
        )
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    new_sha1 = sha1(content)
    existing_document = all(
        [
            rd.sha1 == new_sha1,
            rd.is_available,
            rd.filepath_local and os.path.isfile(rd.filepath_local.path),
        ]
    )
    if not existing_document:
        # Different sha1, it wasn't available, or it's missing from disk.
        # Move the new file over from the processing queue storage.
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        if not pq.debug:
            rd.filepath_local.save(file_name, cf, save=False)

            # Do page count and extraction
            extension = rd.filepath_local.path.split(".")[-1]
            rd.page_count = get_page_count(rd.filepath_local.path, extension)
            rd.file_size = rd.filepath_local.size

        rd.ocr_status = None
        rd.is_available = True
        rd.sha1 = new_sha1
        rd.date_upload = now()

    if not pq.debug:
        try:
            rd.save()
        except (IntegrityError, ValidationError):
            msg = "Duplicate key on unique_together constraint"
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            rd.filepath_local.delete(save=False)
            return None

    if not existing_document and not pq.debug:
        # New content: queue OCR/extraction and push to the search index.
        extract_recap_pdf(rd.pk)
        add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_pq_successful(
        pq,
        d_id=rd.docket_entry.docket_id,
        de_id=rd.docket_entry_id,
        rd_id=rd.pk,
    )
    mark_ia_upload_needed(rd.docket_entry.docket, save_docket=True)
    return rd
def process_recap_pdf(self, pk):
    """Save a RECAP PDF to the database.

    Finds the RECAPDocument matching the processing queue item (or builds
    one from its Docket/DocketEntry), stores the uploaded file when its
    contents are new, and marks the queue item's final status.

    :param pk: The PK of the ProcessingQueue item to process.
    :return: The created or updated RECAPDocument, or None on failure.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item (debug is: %s): %s " % (pq.debug, pq))
    try:
        rd = RECAPDocument.objects.get(
            docket_entry__docket__pacer_case_id=pq.pacer_case_id,
            pacer_doc_id=pq.pacer_doc_id,
        )
    except RECAPDocument.DoesNotExist:
        try:
            d = Docket.objects.get(pacer_case_id=pq.pacer_case_id,
                                   court_id=pq.court_id)
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully the docket
            # will be in place soon (it could be in a different upload task that
            # hasn't yet been processed).
            logger.warning("Unable to find docket for processing queue '%s'. "
                           "Retrying if max_retries is not exceeded." % pq)
            pq.error_message = "Unable to find docket for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                pq.status = pq.PROCESSING_FAILED
                pq.save()
                return None
            else:
                pq.status = pq.QUEUED_FOR_RETRY
                pq.save()
                raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            logger.error(msg)
            pq.error_message = msg
            pq.status = pq.PROCESSING_FAILED
            pq.save()
            return None
        else:
            # Got the Docket, attempt to get/create the DocketEntry, and then
            # create the RECAPDocument
            try:
                de = DocketEntry.objects.get(docket=d,
                                             entry_number=pq.document_number)
            except DocketEntry.DoesNotExist as exc:
                logger.warning("Unable to find docket entry for processing "
                               "queue '%s'. Retrying if max_retries is not "
                               "exceeded." % pq)
                pq.error_message = "Unable to find docket entry for item."
                if (self.request.retries == self.max_retries) or pq.debug:
                    pq.status = pq.PROCESSING_FAILED
                    pq.save()
                    return None
                else:
                    pq.status = pq.QUEUED_FOR_RETRY
                    pq.save()
                    raise self.retry(exc=exc)

            # All objects accounted for. Make some data.
            rd = RECAPDocument(
                docket_entry=de,
                pacer_doc_id=pq.pacer_doc_id,
                date_upload=timezone.now(),
            )
            if pq.attachment_number is None:
                rd.document_type = RECAPDocument.PACER_DOCUMENT
            else:
                rd.document_type = RECAPDocument.ATTACHMENT
            rd.document_number = pq.document_number
            rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    content = pq.filepath_local.read()
    new_sha1 = hashlib.sha1(content).hexdigest()
    existing_document = all([
        rd.sha1 == new_sha1,
        rd.is_available,
        rd.filepath_local and os.path.isfile(rd.filepath_local.path)
    ])
    if not existing_document:
        # Different sha1, it wasn't available, or it's missing from disk. Move
        # the new file over from the processing queue storage.
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        # NOTE(review): the file is written to storage even in debug mode;
        # only the DB save below is gated on pq.debug. Confirm intended.
        rd.filepath_local.save(file_name, cf, save=False)
        rd.is_available = True
        rd.sha1 = new_sha1

        # Do page count and extraction
        extension = rd.filepath_local.path.split('.')[-1]
        rd.page_count = get_page_count(rd.filepath_local.path, extension)
        rd.ocr_status = None

    if not pq.debug:
        rd.save()

    if not existing_document and not pq.debug:
        # New content: queue extraction and update the search index.
        extract_recap_pdf(rd.pk)
        add_or_update_recap_document([rd.pk], force_commit=False)

    mark_pq_successful(pq, d_id=rd.docket_entry.docket_id,
                       de_id=rd.docket_entry_id,
                       rd_id=rd.pk)
    return rd
def process_recap_pdf(self, pk):
    """Save a RECAP PDF to the database.

    Finds or builds the RECAPDocument for the processing queue item,
    stores the uploaded file when its contents are new, then marks the
    queue item successful and discards its temporary file.

    :param pk: The PK of the ProcessingQueue item to process.
    :return: The created or updated RECAPDocument, or None on failure.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item: %s" % pq)
    try:
        rd = RECAPDocument.objects.get(
            docket_entry__docket__pacer_case_id=pq.pacer_case_id,
            pacer_doc_id=pq.pacer_doc_id,
        )
    except RECAPDocument.DoesNotExist:
        try:
            d = Docket.objects.get(pacer_case_id=pq.pacer_case_id,
                                   court_id=pq.court_id)
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully the docket
            # will be in place soon (it could be in a different upload task that
            # hasn't yet been processed).
            logger.warning("Unable to find docket for processing queue '%s'. "
                           "Retrying if max_retries is not exceeded." % pq)
            pq.error_message = "Unable to find docket for item."
            if self.request.retries == self.max_retries:
                pq.status = pq.PROCESSING_FAILED
            else:
                pq.status = pq.QUEUED_FOR_RETRY
            pq.save()
            raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            logger.error(msg)
            pq.error_message = msg
            pq.status = pq.PROCESSING_FAILED
            pq.save()
            return None
        else:
            # Got the Docket, attempt to get/create the DocketEntry, and then
            # create the RECAPDocument
            try:
                de = DocketEntry.objects.get(docket=d,
                                             entry_number=pq.document_number)
            except DocketEntry.DoesNotExist as exc:
                logger.warning("Unable to find docket entry for processing "
                               "queue '%s'. Retrying if max_retries is not "
                               "exceeded." % pq)
                pq.error_message = "Unable to find docket entry for item."
                if self.request.retries == self.max_retries:
                    pq.status = pq.PROCESSING_FAILED
                else:
                    pq.status = pq.QUEUED_FOR_RETRY
                pq.save()
                raise self.retry(exc=exc)

            # All objects accounted for. Make some data.
            rd = RECAPDocument(
                docket_entry=de,
                pacer_doc_id=pq.pacer_doc_id,
                date_upload=timezone.now(),
            )
            if pq.attachment_number is None:
                rd.document_type = RECAPDocument.PACER_DOCUMENT
            else:
                rd.document_type = RECAPDocument.ATTACHMENT
            rd.document_number = pq.document_number
            rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    content = pq.filepath_local.read()
    new_sha1 = hashlib.sha1(content).hexdigest()
    if all([rd.sha1 == new_sha1,
            rd.is_available,
            rd.filepath_local and os.path.isfile(rd.filepath_local.path)]):
        # All good. Press on.
        new_document = False
    else:
        # Different sha1, it wasn't available, or it's missing from disk. Move
        # the new file over from the processing queue storage.
        new_document = True
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        rd.filepath_local.save(file_name, cf, save=False)
        rd.is_available = True
        rd.sha1 = new_sha1

        # Do page count and extraction
        extension = rd.filepath_local.path.split('.')[-1]
        rd.page_count = get_page_count(rd.filepath_local.path, extension)
        rd.ocr_status = None

    # Ditch the original file
    pq.filepath_local.delete(save=False)
    pq.error_message = ''  # Clear out errors b/c successful
    pq.status = pq.PROCESSING_SUCCESSFUL
    pq.save()

    rd.save()
    if new_document:
        # New content: queue extraction and update the search index.
        extract_recap_pdf(rd.pk)
        add_or_update_recap_document([rd.pk], force_commit=False)
    return rd