def test_filesize_conversions(self):
    """Can we convert human filesizes to bytes?"""
    qa_pairs = [
        ('58 kb', 59392),
        ('117 kb', 119808),
        ('117kb', 119808),
        ('1 byte', 1),
        ('117 bytes', 117),
        ('117 bytes', 117),
        (' 117 bytes ', 117),
        ('117b', 117),
        ('117bytes', 117),
        ('1 kilobyte', 1024),
        ('117 kilobytes', 119808),
        ('0.7 mb', 734003),
        ('1mb', 1048576),
        ('5.2 mb', 5452595),
    ]
    # Each pair is (human-readable size, expected byte count).
    for human_size, expected_bytes in qa_pairs:
        print("Converting '%s' to bytes..." % human_size, end='')
        self.assertEqual(convert_size_to_bytes(human_size), expected_bytes)
        print('✓')
def test_filesize_conversions(self):
    """Can we convert human filesizes to bytes?"""
    # (human-readable size, expected byte count)
    conversions = (
        ("58 kb", 59392),
        ("117 kb", 119808),
        ("117kb", 119808),
        ("1 byte", 1),
        ("117 bytes", 117),
        ("117 bytes", 117),
        (" 117 bytes ", 117),
        ("117b", 117),
        ("117bytes", 117),
        ("1 kilobyte", 1024),
        ("117 kilobytes", 119808),
        ("0.7 mb", 734003),
        ("1mb", 1048576),
        ("5.2 mb", 5452595),
    )
    for size_str, want in conversions:
        print("Converting '%s' to bytes..." % size_str, end="")
        got = convert_size_to_bytes(size_str)
        self.assertEqual(got, want)
        print("✓")
def test_filesize_conversions(self):
    """Can we convert human filesizes to bytes?"""
    cases = [
        ('58 kb', 59392),
        ('117 kb', 119808),
        ('117kb', 119808),
        ('1 byte', 1),
        ('117 bytes', 117),
        ('117 bytes', 117),
        (' 117 bytes ', 117),
        ('117b', 117),
        ('117bytes', 117),
        ('1 kilobyte', 1024),
        ('117 kilobytes', 119808),
        ('0.7 mb', 734003),
        ('1mb', 1048576),
        ('5.2 mb', 5452595),
    ]
    for case in cases:
        # Unpack after iteration so the pair stays visible while debugging.
        human, expected = case
        print("Converting '%s' to bytes..." % human, end='')
        self.assertEqual(convert_size_to_bytes(human), expected)
        print('✓')
def process_recap_zip(self, pk):
    """Process a zip uploaded from a PACER district court

    The general process is to use our existing infrastructure. We open the
    zip, identify the documents inside, and then associate them with the
    rest of our collection.

    :param self: A celery task object
    :param pk: The PK of the ProcessingQueue object to process
    :return: A list of new PQ's that were created, one per PDF that was
    enqueued.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)

    logger.info("Processing RECAP zip (debug is: %s): %s", pq.debug, pq)
    with ZipFile(pq.filepath_local.path, "r") as archive:
        # Security: refuse anything whose expanded size reaches the cap —
        # it may be a zip bomb.
        size_cap = convert_size_to_bytes("200MB")
        for member in archive.infolist():
            if member.file_size >= size_cap:
                mark_pq_status(
                    pq,
                    "Zip too large; possible zip bomb. File in zip named %s "
                    "would be %s bytes expanded."
                    % (member.filename, member.file_size),
                    PROCESSING_STATUS.INVALID_CONTENT,
                )
                return {"new_pqs": [], "tasks": []}

        # Fan out: one new ProcessingQueue and one PDF-processing task per
        # document in the zip.
        new_pqs = []
        tasks = []
        for member_name in archive.namelist():
            upload = SimpleUploadedFile(member_name, archive.read(member_name))

            # Names look like "<doc_num>.pdf" or "<doc_num>-<att_num>.pdf",
            # where att_num may also be the literal string "main".
            # NOTE(review): a stem containing two or more hyphens would raise
            # ValueError on the unpack below — assumed not to occur; confirm
            # against actual PACER zip member names.
            stem = member_name.split(".pdf")[0]
            att_num = None
            if "-" in stem:
                doc_num, att_num = stem.split("-")
                if att_num == "main":
                    att_num = None
            else:
                doc_num = stem

            # An attachment's pacer_doc_id corresponds to the main doc only,
            # so blank it out for attachments.
            pacer_doc_id = "" if att_num else pq.pacer_doc_id

            # Create a new PQ and enqueue it for processing.
            new_pq = ProcessingQueue.objects.create(
                court=pq.court,
                uploader=pq.uploader,
                pacer_case_id=pq.pacer_case_id,
                pacer_doc_id=pacer_doc_id,
                document_number=doc_num,
                attachment_number=att_num,
                filepath_local=upload,
                status=PROCESSING_STATUS.ENQUEUED,
                upload_type=UPLOAD_TYPE.PDF,
                debug=pq.debug,
            )
            new_pqs.append(new_pq.pk)
            tasks.append(process_recap_pdf.delay(new_pq.pk))

        # Mark the original PQ successful.
        mark_pq_status(
            pq,
            "Successfully created ProcessingQueue objects: %s"
            % oxford_join(new_pqs),
            PROCESSING_STATUS.SUCCESSFUL,
        )

        # Returning the tasks allows tests to wait() for the PDFs to complete
        # before checking assertions.
        return {
            "new_pqs": new_pqs,
            "tasks": tasks,
        }
def process_recap_attachment(self, pk, tag_names=None):
    """Process an uploaded attachment page from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on
    :param tag_names: A list of tag names to add to all items created or
    modified in this function.
    :return: None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s", pq.debug, pq)

    att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id))
    # Fix: open in binary mode. Text-mode reads return str, which has no
    # .decode() in Python 3 — the old code raised AttributeError here.
    with open(pq.filepath_local.path, "rb") as f:
        text = f.read().decode("utf-8")
    att_page._parse_text(text)
    att_data = att_page.data
    logger.info("Parsing completed for item %s", pq)

    if att_data == {}:
        # Bad attachment page; stop the celery chain.
        msg = "Not a valid attachment page upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    if pq.pacer_case_id in ["undefined", "null"]:
        # Bad data from the client. Fix it with parsed data.
        pq.pacer_case_id = att_data.get("pacer_case_id")
        pq.save()

    # Merge the contents of the data into CL by finding the main document
    # this attachment page belongs to.
    try:
        params = {
            "pacer_doc_id": att_data["pacer_doc_id"],
            "docket_entry__docket__court": pq.court,
        }
        if pq.pacer_case_id:
            params["docket_entry__docket__pacer_case_id"] = pq.pacer_case_id
        main_rd = RECAPDocument.objects.get(**params)
    except RECAPDocument.MultipleObjectsReturned:
        # Unclear how to proceed and we don't want to associate this data
        # with the wrong case. We must punt.
        msg = (
            "Too many documents found when attempting to associate "
            "attachment data"
        )
        mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
        return None
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # We got the right item. Update/create all the attachments for the
    # docket entry.
    de = main_rd.docket_entry
    if att_data["document_number"] is None:
        # Bankruptcy attachment page. Use the document number from the Main
        # doc.
        att_data["document_number"] = main_rd.document_number

    rds_created = []
    if not pq.debug:
        # Save the old HTML to the docket entry.
        pacer_file = PacerHtmlFiles(
            content_object=de, upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE
        )
        pacer_file.filepath.save(
            "attachment_page.html",  # Irrelevant b/c UUIDFileSystemStorage
            ContentFile(text),
        )

        # Resolve tag names once, up front.
        tags = []
        if tag_names:
            for tag_name in tag_names:
                tag, _ = Tag.objects.get_or_create(name=tag_name)
                tags.append(tag)

        # Create/update the attachment items.
        for attachment in att_data["attachments"]:
            if all(
                [
                    attachment["attachment_number"],
                    # Missing on sealed items.
                    attachment.get("pacer_doc_id", False),
                    # Missing on some restricted docs (see Juriscraper)
                    attachment["page_count"] is not None,
                    attachment["description"],
                ]
            ):
                rd, created = RECAPDocument.objects.update_or_create(
                    docket_entry=de,
                    document_number=att_data["document_number"],
                    attachment_number=attachment["attachment_number"],
                    document_type=RECAPDocument.ATTACHMENT,
                )
                if created:
                    rds_created.append(rd)
                needs_save = False
                for field in ["description", "pacer_doc_id"]:
                    if attachment[field]:
                        setattr(rd, field, attachment[field])
                        needs_save = True

                # Only set page_count and file_size if they're blank, in case
                # we got the real value by measuring.
                if rd.page_count is None:
                    rd.page_count = attachment["page_count"]
                    # Fix: previously this assignment was never persisted
                    # unless description/pacer_doc_id also changed.
                    needs_save = True
                if rd.file_size is None and attachment["file_size_str"]:
                    try:
                        rd.file_size = convert_size_to_bytes(
                            attachment["file_size_str"]
                        )
                        # Fix: same persistence issue as page_count above.
                        needs_save = True
                    except ValueError:
                        # Unparseable size string; leave file_size blank.
                        pass

                if needs_save:
                    rd.save()
                if tags:
                    for tag in tags:
                        tag.tag_object(rd)

                # Do *not* do this async — that can cause race conditions.
                add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)
    process_orphan_documents(
        rds_created, pq.court_id, main_rd.docket_entry.docket.date_filed
    )
    changed = mark_ia_upload_needed(de.docket)
    if changed:
        de.docket.save()
def merge_attachment_page_data(
    court: Court,
    pacer_case_id: int,
    pacer_doc_id: int,
    document_number: int,
    text: str,
    attachment_dicts: List[Dict[str, Union[int, str]]],
    debug: bool = False,
) -> Tuple[List[RECAPDocument], DocketEntry]:
    """Merge attachment page data into the docket

    :param court: The court object we're working with
    :param pacer_case_id: A PACER case ID
    :param pacer_doc_id: A PACER document ID
    :param document_number: The docket entry number
    :param text: The text of the attachment page
    :param attachment_dicts: A list of Juriscraper-parsed dicts for each
    attachment.
    :param debug: Whether to do saves during this process.
    :return: A list of RECAPDocuments modified or created during the process,
    and the DocketEntry object associated with the RECAPDocuments
    :raises: RECAPDocument.MultipleObjectsReturned, RECAPDocument.DoesNotExist
    """
    # Find the main document that this attachment page hangs off of.
    lookup = {
        "pacer_doc_id": pacer_doc_id,
        "docket_entry__docket__court": court,
    }
    if pacer_case_id:
        lookup["docket_entry__docket__pacer_case_id"] = pacer_case_id
    try:
        main_rd = RECAPDocument.objects.get(**lookup)
    except RECAPDocument.MultipleObjectsReturned as exc:
        # Ambiguous match: we must not attach this data to the wrong case,
        # so punt to the caller.
        raise exc
    except RECAPDocument.DoesNotExist as exc:
        # No docket to associate the metadata with. Orphan-document recovery
        # (as done when adding dockets) could help here, particularly for
        # users who get free-look emails and then visit the attachment page.
        raise exc

    # We got the right item. Update/create all the attachments for the
    # docket entry.
    de = main_rd.docket_entry
    if document_number is None:
        # Bankruptcy attachment pages lack a document number; inherit it
        # from the main document.
        document_number = main_rd.document_number

    if debug:
        return [], de

    # Preserve the raw attachment-page HTML on the docket entry.
    pacer_file = PacerHtmlFiles(
        content_object=de, upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE
    )
    pacer_file.filepath.save(
        "attachment_page.html",  # Irrelevant b/c UUIDFileSystemStorage
        ContentFile(text),
    )

    # Create/update the attachment items.
    rds_created = []
    rds_affected = []
    for attachment in attachment_dicts:
        # Skip rows missing required fields (e.g. sealed items lack a
        # pacer_doc_id; some restricted docs lack a page count).
        if not all(
            [
                attachment["attachment_number"],
                attachment.get("pacer_doc_id", False),
                attachment["page_count"] is not None,
                attachment["description"],
            ]
        ):
            continue

        rd, created = RECAPDocument.objects.update_or_create(
            docket_entry=de,
            document_number=document_number,
            attachment_number=attachment["attachment_number"],
            document_type=RECAPDocument.ATTACHMENT,
        )
        rds_affected.append(rd)
        if created:
            rds_created.append(rd)

        for field in ("description", "pacer_doc_id"):
            if attachment[field]:
                setattr(rd, field, attachment[field])

        # Only fill in page_count and file_size when blank, in case we got
        # the real value by measuring.
        if rd.page_count is None:
            rd.page_count = attachment["page_count"]
        if rd.file_size is None and attachment["file_size_str"]:
            try:
                rd.file_size = convert_size_to_bytes(
                    attachment["file_size_str"]
                )
            except ValueError:
                pass
        rd.save()

        # Do *not* do this async — that can cause race conditions.
        add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_ia_upload_needed(de.docket, save_docket=True)
    process_orphan_documents(
        rds_created, court.pk, main_rd.docket_entry.docket.date_filed
    )
    return rds_affected, de