def test_issue_729_url_coalescing(self):
    """Are URLs coalesced properly?"""
    # Save a docket to the backend using coalescing
    d = Docket.objects.create(
        source=Docket.RECAP,
        docket_number='asdf',
        pacer_case_id='asdf',
        court_id='test',
    )
    de = DocketEntry.objects.create(
        docket=d,
        entry_number=1,
    )
    rd1 = RECAPDocument.objects.create(
        docket_entry=de,
        document_type=RECAPDocument.PACER_DOCUMENT,
        document_number='1',
        pacer_doc_id='1',
    )
    rd2 = RECAPDocument.objects.create(
        docket_entry=de,
        document_type=RECAPDocument.ATTACHMENT,
        document_number='1',
        attachment_number=1,
        pacer_doc_id='2',
    )
    # Do the absolute URLs differ when pulled from the DB?
    self.assertNotEqual(rd1.get_absolute_url(), rd2.get_absolute_url())

    add_or_update_recap_document([rd1.pk, rd2.pk], coalesce_docket=True,
                                 force_commit=True)

    # Do the absolute URLs differ when pulled from Solr?
    r1 = self.si_recap.get(rd1.pk)
    r2 = self.si_recap.get(rd2.pk)
    self.assertNotEqual(
        r1.result.docs[0]['absolute_url'],
        r2.result.docs[0]['absolute_url'],
    )
def process_recap_pdf(self, pk):
    """Process an uploaded PDF from the RECAP API endpoint.

    :param pk: The PK of the processing queue item you want to work on.
    :return: A RECAPDocument object that was created or updated.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)

    if pq.attachment_number is None:
        document_type = RECAPDocument.PACER_DOCUMENT
    else:
        document_type = RECAPDocument.ATTACHMENT

    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))
    try:
        if pq.pacer_case_id:
            rd = RECAPDocument.objects.get(
                docket_entry__docket__pacer_case_id=pq.pacer_case_id,
                pacer_doc_id=pq.pacer_doc_id,
            )
        else:
            # Sometimes we don't have the case ID from PACER. Try to make
            # this work anyway.
            rd = RECAPDocument.objects.get(
                pacer_doc_id=pq.pacer_doc_id,
            )
    except (RECAPDocument.DoesNotExist,
            RECAPDocument.MultipleObjectsReturned):
        try:
            d = Docket.objects.get(pacer_case_id=pq.pacer_case_id,
                                   court_id=pq.court_id)
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully the
            # docket will be in place soon (it could be in a different
            # upload task that hasn't yet been processed).
            logger.warning("Unable to find docket for processing queue '%s'. "
                           "Retrying if max_retries is not exceeded." % pq)
            error_message = "Unable to find docket for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, error_message, pq.PROCESSING_FAILED)
                return None
            else:
                mark_pq_status(pq, error_message, pq.QUEUED_FOR_RETRY)
                raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None
        else:
            # Got the Docket, attempt to get/create the DocketEntry, and
            # then create the RECAPDocument.
            try:
                de = DocketEntry.objects.get(docket=d,
                                             entry_number=pq.document_number)
            except DocketEntry.DoesNotExist as exc:
                logger.warning("Unable to find docket entry for processing "
                               "queue '%s'. Retrying if max_retries is not "
                               "exceeded." % pq)
                pq.error_message = "Unable to find docket entry for item."
                if (self.request.retries == self.max_retries) or pq.debug:
                    pq.status = pq.PROCESSING_FAILED
                    pq.save()
                    return None
                else:
                    pq.status = pq.QUEUED_FOR_RETRY
                    pq.save()
                    raise self.retry(exc=exc)
            else:
                # If we're here, we've got the docket and docket entry, but
                # were unable to find the document by pacer_doc_id. This
                # happens when pacer_doc_id is missing, for example.
                # Therefore, try to get the document from the docket entry.
                try:
                    rd = RECAPDocument.objects.get(
                        docket_entry=de,
                        document_number=pq.document_number,
                        attachment_number=pq.attachment_number,
                        document_type=document_type,
                    )
                except (RECAPDocument.DoesNotExist,
                        RECAPDocument.MultipleObjectsReturned):
                    # Unable to find it. Make a new item.
                    rd = RECAPDocument(
                        docket_entry=de,
                        pacer_doc_id=pq.pacer_doc_id,
                        date_upload=timezone.now(),
                        document_type=document_type,
                    )

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    content = pq.filepath_local.read()
    new_sha1 = hashlib.sha1(content).hexdigest()
    existing_document = all([
        rd.sha1 == new_sha1,
        rd.is_available,
        rd.filepath_local and os.path.isfile(rd.filepath_local.path),
    ])
    if not existing_document:
        # Different sha1, it wasn't available, or it's missing from disk.
        # Move the new file over from the processing queue storage.
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        if not pq.debug:
            rd.filepath_local.save(file_name, cf, save=False)

            # Do page count and extraction
            extension = rd.filepath_local.path.split('.')[-1]
            rd.page_count = get_page_count(rd.filepath_local.path, extension)
            rd.ocr_status = None

        rd.is_available = True
        rd.sha1 = new_sha1

    if not pq.debug:
        try:
            rd.save()
        except IntegrityError:
            msg = "Duplicate key on unique_together constraint"
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            rd.filepath_local.delete(save=False)
            return None

    if not existing_document and not pq.debug:
        extract_recap_pdf(rd.pk)
        add_or_update_recap_document([rd.pk], force_commit=False)

    mark_pq_successful(pq, d_id=rd.docket_entry.docket_id,
                       de_id=rd.docket_entry_id, rd_id=rd.pk)
    return rd
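# NOTE: A registration sketch, not from the original source. The calls to
# self.retry(), self.request.retries, and self.max_retries above imply that
# process_recap_pdf is registered as a bound Celery task. The app setup,
# broker URL, and retry count below are assumptions for illustration only.
from celery import Celery

app = Celery('cl', broker='redis://localhost:6379/0')


@app.task(bind=True, max_retries=2)
def process_recap_pdf(self, pk):
    ...  # function body as defined above


# Upload handlers would then presumably enqueue work by ProcessingQueue PK
# rather than calling the function inline:
# process_recap_pdf.delay(pq.pk)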
def process_recap_attachment(self, pk):
    """Process an uploaded attachment page from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on
    :return: None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id))
    with open(pq.filepath_local.path) as f:
        text = f.read().decode('utf-8')
    att_page._parse_text(text)
    att_data = att_page.data
    logger.info("Parsing completed for item %s" % pq)

    if pq.pacer_case_id in ['undefined', 'null']:
        # Bad data from the client. Fix it with parsed data.
        pq.pacer_case_id = att_data.get('pacer_case_id')
        pq.save()

    # Merge the contents of the data into CL.
    try:
        params = {
            'pacer_doc_id': att_data['pacer_doc_id'],
            'docket_entry__docket__court': pq.court,
        }
        if pq.pacer_case_id:
            params['docket_entry__docket__pacer_case_id'] = pq.pacer_case_id
        main_rd = RECAPDocument.objects.get(**params)
    except RECAPDocument.MultipleObjectsReturned:
        # Unclear how to proceed, and we don't want to associate this data
        # with the wrong case. We must punt.
        msg = "Too many documents found when attempting to associate " \
              "attachment data"
        mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
        return None
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None
        else:
            mark_pq_status(pq, msg, pq.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # We got the right item. Update/create all the attachments for the
    # docket entry.
    de = main_rd.docket_entry
    if att_data['document_number'] is None:
        # Bankruptcy attachment page. Use the document number from the main
        # doc.
        att_data['document_number'] = main_rd.document_number

    rds_created = []
    if not pq.debug:
        # Save the old HTML to the docket entry.
        pacer_file = PacerHtmlFiles(content_object=de,
                                    upload_type=ATTACHMENT_PAGE)
        pacer_file.filepath.save(
            'attachment_page.html',  # Irrelevant b/c UUIDFileSystemStorage
            ContentFile(text),
        )

        # Create/update the attachment items.
        for attachment in att_data['attachments']:
            if all([attachment['attachment_number'],
                    # Missing on sealed items.
                    attachment.get('pacer_doc_id', False),
                    # Missing on some restricted docs (see Juriscraper).
                    attachment['page_count'] is not None,
                    attachment['description']]):
                rd, created = RECAPDocument.objects.update_or_create(
                    docket_entry=de,
                    document_number=att_data['document_number'],
                    attachment_number=attachment['attachment_number'],
                    document_type=RECAPDocument.ATTACHMENT,
                )
                if created:
                    rds_created.append(rd)
                needs_save = False
                for field in ['description', 'pacer_doc_id']:
                    if attachment[field]:
                        setattr(rd, field, attachment[field])
                        needs_save = True
                if needs_save:
                    rd.save()

                # Do *not* do this async — that can cause race conditions.
                add_or_update_recap_document([rd.pk], force_commit=False)

    mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)
    process_orphan_documents(rds_created, pq.court_id,
                             main_rd.docket_entry.docket.date_filed)
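# NOTE: mark_pq_status is used throughout the functions above but is not
# defined in this excerpt. Judging from the older versions below, which set
# pq.error_message and pq.status inline before calling pq.save(), it is
# presumably a small helper along these lines. A sketch of the assumed
# implementation, not the actual one:
def mark_pq_status(pq, error_message, status):
    """Update a ProcessingQueue item's error message and status, then save.

    :param pq: The ProcessingQueue object to update.
    :param error_message: The error message to record ('' on success).
    :param status: One of the ProcessingQueue status constants.
    """
    pq.error_message = error_message
    pq.status = status
    pq.save()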
def get_pacer_doc_by_rd_and_description(self, rd_pk, description_re, session,
                                        fallback_to_main_doc=False, tag=None):
    """Using a RECAPDocument object ID and a description of a document, get
    the document from PACER.

    This function was originally meant to get civil cover sheets, but can be
    repurposed as needed.

    :param rd_pk: The PK of a RECAPDocument object to use as a source.
    :param description_re: A compiled regular expression to search against
    the description provided by the attachment page.
    :param session: The PACER session object to use.
    :param fallback_to_main_doc: Should we grab the main doc if none of the
    attachments match the regex?
    :param tag: A tag name to apply to any downloaded content.
    :return: None
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    if not rd.pacer_doc_id:
        # Some docket entries are just text/don't have a pacer_doc_id.
        self.request.callbacks = None
        return

    d = rd.docket_entry.docket
    pacer_court_id = map_cl_to_pacer_id(d.court_id)
    att_report = AttachmentPage(pacer_court_id, session)
    try:
        att_report.query(rd.pacer_doc_id)
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" % rd)
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [HTTP_500_INTERNAL_SERVER_ERROR,
                                        HTTP_504_GATEWAY_TIMEOUT]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            self.request.callbacks = None
            return

    att_found = None
    for attachment in att_report.data.get('attachments', []):
        if description_re.search(attachment['description']):
            att_found = attachment.copy()
            document_type = RECAPDocument.ATTACHMENT
            break

    if not att_found:
        if fallback_to_main_doc:
            logger.info("Falling back to main document for pacer_doc_id: "
                        "%s" % rd.pacer_doc_id)
            att_found = att_report.data
            document_type = RECAPDocument.PACER_DOCUMENT
        else:
            msg = "Aborting. Did not find civil cover sheet for %s." % rd
            logger.error(msg)
            self.request.callbacks = None
            return

    if not att_found.get('pacer_doc_id'):
        logger.warning("No pacer_doc_id for document (is it sealed?)")
        self.request.callbacks = None
        return

    # Try to find the attachment already in the collection
    rd, _ = RECAPDocument.objects.get_or_create(
        docket_entry=rd.docket_entry,
        attachment_number=att_found.get('attachment_number'),
        document_number=rd.document_number,
        pacer_doc_id=att_found['pacer_doc_id'],
        document_type=document_type,
        defaults={
            'date_upload': now(),
        },
    )
    # Replace the description if we have description data. Else fall back on
    # the old description.
    rd.description = att_found.get('description', '') or rd.description
    if tag is not None:
        tag, _ = Tag.objects.get_or_create(name=tag)
        rd.tags.add(tag)

    if rd.is_available:
        # Great. Call it a day.
        rd.save(do_extraction=False, index=False)
        return

    # Not available. Go get it.
    try:
        pacer_case_id = rd.docket_entry.docket.pacer_case_id
        r = att_report.download_pdf(pacer_case_id, att_found['pacer_doc_id'])
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" %
                       att_found['pacer_doc_id'])
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [HTTP_500_INTERNAL_SERVER_ERROR,
                                        HTTP_504_GATEWAY_TIMEOUT]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            self.request.callbacks = None
            return

    if r is None:
        msg = "Unable to get PDF for %s at PACER court '%s' with doc id %s" % \
              (rd, pacer_court_id, rd.pacer_doc_id)
        logger.error(msg)
        self.request.callbacks = None
        return

    file_name = get_document_filename(
        d.court_id,
        pacer_case_id,
        rd.document_number,
        rd.attachment_number,
    )
    cf = ContentFile(r.content)
    rd.filepath_local.save(file_name, cf, save=False)
    rd.is_available = True  # We've got the PDF.

    # r.content is sometimes a str, sometimes unicode. Force it all to be
    # bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(r.content)).hexdigest()
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save, extract, then save to Solr. Skip OCR for now. Don't do these
    # async.
    rd.save(do_extraction=False, index=False)
    extract_recap_pdf(rd.pk, skip_ocr=True)
    add_or_update_recap_document([rd.pk])
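# NOTE: An illustrative invocation, not from the original source. Because the
# function takes `self` and calls self.retry(), it is presumably a bound
# Celery task, so callers would enqueue it (shown here with .delay for
# brevity; in practice it may be chained with other tasks). The tag name is
# hypothetical; description_re must be a *compiled* regex per the docstring.
import re

get_pacer_doc_by_rd_and_description.delay(
    rd.pk,
    re.compile(r'civil cover sheet', re.IGNORECASE),
    session,  # an authenticated PACER session object
    fallback_to_main_doc=True,
    tag='civil-cover-sheets',
)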
def process_recap_pdf(self, pk):
    """Save a RECAP PDF to the database."""
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))
    try:
        rd = RECAPDocument.objects.get(
            docket_entry__docket__pacer_case_id=pq.pacer_case_id,
            pacer_doc_id=pq.pacer_doc_id,
        )
    except RECAPDocument.DoesNotExist:
        try:
            d = Docket.objects.get(pacer_case_id=pq.pacer_case_id,
                                   court_id=pq.court_id)
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully the
            # docket will be in place soon (it could be in a different
            # upload task that hasn't yet been processed).
            logger.warning("Unable to find docket for processing queue '%s'. "
                           "Retrying if max_retries is not exceeded." % pq)
            pq.error_message = "Unable to find docket for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                pq.status = pq.PROCESSING_FAILED
                pq.save()
                return None
            else:
                pq.status = pq.QUEUED_FOR_RETRY
                pq.save()
                raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            logger.error(msg)
            pq.error_message = msg
            pq.status = pq.PROCESSING_FAILED
            pq.save()
            return None
        else:
            # Got the Docket, attempt to get/create the DocketEntry, and
            # then create the RECAPDocument.
            try:
                de = DocketEntry.objects.get(docket=d,
                                             entry_number=pq.document_number)
            except DocketEntry.DoesNotExist as exc:
                logger.warning("Unable to find docket entry for processing "
                               "queue '%s'. Retrying if max_retries is not "
                               "exceeded." % pq)
                pq.error_message = "Unable to find docket entry for item."
                if (self.request.retries == self.max_retries) or pq.debug:
                    pq.status = pq.PROCESSING_FAILED
                    pq.save()
                    return None
                else:
                    pq.status = pq.QUEUED_FOR_RETRY
                    pq.save()
                    raise self.retry(exc=exc)

            # All objects accounted for. Make some data.
            rd = RECAPDocument(
                docket_entry=de,
                pacer_doc_id=pq.pacer_doc_id,
                date_upload=timezone.now(),
            )
            if pq.attachment_number is None:
                rd.document_type = RECAPDocument.PACER_DOCUMENT
            else:
                rd.document_type = RECAPDocument.ATTACHMENT
            rd.document_number = pq.document_number
            rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    content = pq.filepath_local.read()
    new_sha1 = hashlib.sha1(content).hexdigest()
    existing_document = all([
        rd.sha1 == new_sha1,
        rd.is_available,
        rd.filepath_local and os.path.isfile(rd.filepath_local.path),
    ])
    if not existing_document:
        # Different sha1, it wasn't available, or it's missing from disk.
        # Move the new file over from the processing queue storage.
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        rd.filepath_local.save(file_name, cf, save=False)
        rd.is_available = True
        rd.sha1 = new_sha1

        # Do page count and extraction
        extension = rd.filepath_local.path.split('.')[-1]
        rd.page_count = get_page_count(rd.filepath_local.path, extension)
        rd.ocr_status = None

    if not pq.debug:
        rd.save()

    if not existing_document and not pq.debug:
        extract_recap_pdf(rd.pk)
        add_or_update_recap_document([rd.pk], force_commit=False)

    mark_pq_successful(pq, d_id=rd.docket_entry.docket_id,
                       de_id=rd.docket_entry_id, rd_id=rd.pk)
    return rd
def get_pacer_doc_by_rd_and_description(self, rd_pk, description_re, cookies,
                                        fallback_to_main_doc=False, tag=None):
    """Using a RECAPDocument object ID and a description of a document, get
    the document from PACER.

    This function was originally meant to get civil cover sheets, but can be
    repurposed as needed.

    :param rd_pk: The PK of a RECAPDocument object to use as a source.
    :param description_re: A compiled regular expression to search against
    the description provided by the attachment page.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of
    a logged-in PACER user.
    :param fallback_to_main_doc: Should we grab the main doc if none of the
    attachments match the regex?
    :param tag: A tag name to apply to any downloaded content.
    :return: None
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    att_report = get_attachment_page_by_rd(self, rd_pk, cookies)

    att_found = None
    for attachment in att_report.data.get('attachments', []):
        if description_re.search(attachment['description']):
            att_found = attachment.copy()
            document_type = RECAPDocument.ATTACHMENT
            break

    if not att_found:
        if fallback_to_main_doc:
            logger.info("Falling back to main document for pacer_doc_id: "
                        "%s" % rd.pacer_doc_id)
            att_found = att_report.data
            document_type = RECAPDocument.PACER_DOCUMENT
        else:
            msg = "Aborting. Did not find civil cover sheet for %s." % rd
            logger.error(msg)
            self.request.callbacks = None
            return

    if not att_found.get('pacer_doc_id'):
        logger.warning("No pacer_doc_id for document (is it sealed?)")
        self.request.callbacks = None
        return

    # Try to find the attachment already in the collection
    rd, _ = RECAPDocument.objects.get_or_create(
        docket_entry=rd.docket_entry,
        attachment_number=att_found.get('attachment_number'),
        document_number=rd.document_number,
        pacer_doc_id=att_found['pacer_doc_id'],
        document_type=document_type,
        defaults={
            'date_upload': now(),
        },
    )
    # Replace the description if we have description data. Else fall back on
    # the old description.
    rd.description = att_found.get('description', '') or rd.description
    if tag is not None:
        tag, _ = Tag.objects.get_or_create(name=tag)
        tag.tag_object(rd)

    if rd.is_available:
        # Great. Call it a day.
        rd.save()
        return

    pacer_case_id = rd.docket_entry.docket.pacer_case_id
    r = download_pacer_pdf_by_rd(rd.pk, pacer_case_id,
                                 att_found['pacer_doc_id'], cookies)
    court_id = rd.docket_entry.docket.court_id
    success, msg = update_rd_metadata(self, rd_pk, r, court_id, pacer_case_id,
                                      rd.pacer_doc_id, rd.document_number,
                                      rd.attachment_number)
    if success is False:
        return

    # Skip OCR for now. It'll happen in a second step.
    extract_recap_pdf(rd.pk, skip_ocr=True)
    add_or_update_recap_document([rd.pk])
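# NOTE: update_rd_metadata is called above but not defined in this excerpt.
# Judging from the older inline version of this task earlier in this section
# (file save, sha1 over force_bytes(r.content), page count), it presumably
# wraps that logic and reports failure instead of raising. A sketch of the
# assumed contract, not the actual implementation:
def update_rd_metadata(self, rd_pk, response, court_id, pacer_case_id,
                       pacer_doc_id, document_number, attachment_number):
    """Attach a downloaded PDF to a RECAPDocument.

    :return: A (success, msg) tuple.
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    if response is None:
        msg = "Unable to get PDF for RECAPDocument %s, doc id %s" % \
              (rd_pk, pacer_doc_id)
        logger.error(msg)
        return False, msg

    file_name = get_document_filename(court_id, pacer_case_id,
                                      document_number, attachment_number)
    rd.filepath_local.save(file_name, ContentFile(response.content),
                           save=False)
    rd.is_available = True
    rd.sha1 = hashlib.sha1(force_bytes(response.content)).hexdigest()
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')
    rd.save(do_extraction=False, index=False)
    return True, 'Success'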
def process_recap_pdf(self, pk):
    """Save a RECAP PDF to the database."""
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item: %s" % pq)
    try:
        rd = RECAPDocument.objects.get(
            docket_entry__docket__pacer_case_id=pq.pacer_case_id,
            pacer_doc_id=pq.pacer_doc_id,
        )
    except RECAPDocument.DoesNotExist:
        try:
            d = Docket.objects.get(pacer_case_id=pq.pacer_case_id,
                                   court_id=pq.court_id)
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully the
            # docket will be in place soon (it could be in a different
            # upload task that hasn't yet been processed).
            logger.warning("Unable to find docket for processing queue '%s'. "
                           "Retrying if max_retries is not exceeded." % pq)
            pq.error_message = "Unable to find docket for item."
            if self.request.retries == self.max_retries:
                pq.status = pq.PROCESSING_FAILED
            else:
                pq.status = pq.QUEUED_FOR_RETRY
            pq.save()
            raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            logger.error(msg)
            pq.error_message = msg
            pq.status = pq.PROCESSING_FAILED
            pq.save()
            return None
        else:
            # Got the Docket, attempt to get/create the DocketEntry, and
            # then create the RECAPDocument.
            try:
                de = DocketEntry.objects.get(docket=d,
                                             entry_number=pq.document_number)
            except DocketEntry.DoesNotExist as exc:
                logger.warning("Unable to find docket entry for processing "
                               "queue '%s'. Retrying if max_retries is not "
                               "exceeded." % pq)
                pq.error_message = "Unable to find docket entry for item."
                if self.request.retries == self.max_retries:
                    pq.status = pq.PROCESSING_FAILED
                else:
                    pq.status = pq.QUEUED_FOR_RETRY
                pq.save()
                raise self.retry(exc=exc)

            # All objects accounted for. Make some data.
            rd = RECAPDocument(
                docket_entry=de,
                pacer_doc_id=pq.pacer_doc_id,
                date_upload=timezone.now(),
            )
            if pq.attachment_number is None:
                rd.document_type = RECAPDocument.PACER_DOCUMENT
            else:
                rd.document_type = RECAPDocument.ATTACHMENT
            rd.document_number = pq.document_number
            rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    content = pq.filepath_local.read()
    new_sha1 = hashlib.sha1(content).hexdigest()
    if all([rd.sha1 == new_sha1,
            rd.is_available,
            rd.filepath_local and os.path.isfile(rd.filepath_local.path)]):
        # All good. Press on.
        new_document = False
    else:
        # Different sha1, it wasn't available, or it's missing from disk.
        # Move the new file over from the processing queue storage.
        new_document = True
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        rd.filepath_local.save(file_name, cf, save=False)
        rd.is_available = True
        rd.sha1 = new_sha1

        # Do page count and extraction
        extension = rd.filepath_local.path.split('.')[-1]
        rd.page_count = get_page_count(rd.filepath_local.path, extension)
        rd.ocr_status = None

    # Ditch the original file
    pq.filepath_local.delete(save=False)
    pq.error_message = ''  # Clear out errors b/c successful
    pq.status = pq.PROCESSING_SUCCESSFUL
    pq.save()

    rd.save()
    if new_document:
        extract_recap_pdf(rd.pk)
        add_or_update_recap_document([rd.pk], force_commit=False)

    return rd
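# NOTE: A hedged sketch of the caller's side, not from the original source.
# The fields read above (court_id, pacer_case_id, pacer_doc_id,
# document_number, attachment_number, filepath_local) suggest the upload
# endpoint creates a ProcessingQueue row roughly like this before dispatching
# the task by PK; the literal values and any other details are assumptions.
from django.core.files.base import ContentFile

pq = ProcessingQueue.objects.create(
    court_id='test',
    pacer_case_id='asdf',
    pacer_doc_id='1',
    document_number='1',
    attachment_number=None,  # None means a main PACER document
    filepath_local=ContentFile(pdf_bytes, name='upload.pdf'),
)
process_recap_pdf.delay(pq.pk)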
def process_recap_attachment(self, pk):
    """Process an uploaded attachment page from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on
    :return: None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    att_page._parse_text(text)
    att_data = att_page.data
    logger.info("Parsing completed for item %s" % pq)

    # Merge the contents of the data into CL.
    try:
        rd = RECAPDocument.objects.get(
            pacer_doc_id=att_data['pacer_doc_id'],
            docket_entry__docket__court=pq.court,
        )
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None
        else:
            mark_pq_status(pq, msg, pq.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # We got the right item. Update/create all the attachments for the
    # docket entry.
    de = rd.docket_entry
    if att_data['document_number'] is None:
        # Bankruptcy attachment page. Use the document number from the main
        # doc.
        att_data['document_number'] = rd.document_number

    if not pq.debug:
        # Save the old HTML to the docket entry.
        pacer_file = PacerHtmlFiles(content_object=de)
        pacer_file.filepath.save(
            'attachment_page.html',  # Irrelevant b/c UUIDFileSystemStorage
            ContentFile(text),
        )

        # Create/update the attachment items.
        for attachment in att_data['attachments']:
            if all([attachment['attachment_number'],
                    # Missing on sealed items.
                    attachment.get('pacer_doc_id', False),
                    # Missing on some restricted docs (see Juriscraper).
                    attachment['page_count'] is not None,
                    attachment['description']]):
                rd, created = RECAPDocument.objects.update_or_create(
                    docket_entry=de,
                    document_number=att_data['document_number'],
                    attachment_number=attachment['attachment_number'],
                    document_type=RECAPDocument.ATTACHMENT,
                )
                needs_save = False
                for field in ['description', 'pacer_doc_id']:
                    if attachment[field]:
                        setattr(rd, field, attachment[field])
                        needs_save = True
                if needs_save:
                    try:
                        rd.save()
                    except IntegrityError:
                        # Happens when we hit courtlistener/issues#765, in
                        # which we violate the unique constraint on
                        # pacer_doc_id.
                        continue

                # Do *not* do this async — that can cause race conditions.
                add_or_update_recap_document([rd.pk], force_commit=False)

    mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)