def fetch_docket_by_pacer_case_id(
    session, court_id, pacer_case_id, fq,
):
    """Download the docket from PACER and merge it into CL

    :param session: A PacerSession object to work with
    :param court_id: The CL ID of the court
    :param pacer_case_id: The pacer_case_id of the docket, if known
    :param fq: The PacerFetchQueue object
    :return: a dict with information about the docket and the new data
    """
    pacer_court = map_cl_to_pacer_id(court_id)
    report = DocketReport(pacer_court, session)
    report.query(pacer_case_id, **get_fq_docket_kwargs(fq))

    docket_data = report.data
    if not docket_data:
        raise ParsingException("No data found in docket report.")

    # Prefer the docket the fetch queue already points at; otherwise look one
    # up, taking the oldest when several match.
    if fq.docket_id:
        docket = Docket.objects.get(pk=fq.docket_id)
    else:
        docket, count = find_docket_object(
            court_id, pacer_case_id, docket_data["docket_number"]
        )
        if count > 1:
            docket = docket.earliest("date_created")

    rds_created, content_updated = merge_pacer_docket_into_cl_docket(
        docket, pacer_case_id, docket_data, report, appellate=False,
    )
    return {
        "docket_pk": docket.pk,
        "content_updated": bool(rds_created or content_updated),
    }
def reprocess_docket_data(d, filepath, report_type):
    """Reprocess docket data that we already have.

    :param d: A docket object to work on.
    :param filepath: The path to a saved HTML file containing docket or docket
    history report data.
    :param report_type: Whether it's a docket or a docket history report.
    :return: The docket's PK, or None if the report parsed to no data.
    """
    from cl.recap.tasks import update_docket_metadata, add_docket_entries, \
        add_parties_and_attorneys
    if report_type == DOCKET:
        report = DocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(map_cl_to_pacer_id(d.court_id))
    else:
        # Previously an unknown report_type left `report` unbound and crashed
        # below with UnboundLocalError; fail fast with a clear message.
        raise NotImplementedError(
            "The report type with id '%s' is not yet supported." % report_type)
    # Open in binary mode and decode explicitly: a text-mode read() returns
    # str on Python 3, which has no .decode() method. Bytes decode on both 2/3.
    with open(filepath, 'rb') as f:
        text = f.read().decode('utf-8')
    report._parse_text(text)
    data = report.data
    if data == {}:
        # Not a usable report; nothing to merge.
        return None
    update_docket_metadata(d, data)
    d.save()
    add_docket_entries(d, data['docket_entries'])
    if report_type == DOCKET:
        # Only full dockets carry party data.
        add_parties_and_attorneys(d, data['parties'])
    return d.pk
def process_docket_data(d, filepath, report_type):
    """Process docket data file.

    :param d: A docket object to work on.
    :param filepath: The path to a saved HTML file containing docket or docket
    history report data.
    :param report_type: Whether it's a docket or a docket history report.
    :return: The docket's PK, or None if the report parsed to no data.
    """
    from cl.recap.mergers import (
        add_docket_entries,
        add_parties_and_attorneys,
        update_docket_appellate_metadata,
        update_docket_metadata,
        add_bankruptcy_data_to_docket,
        add_claims_to_docket,
    )

    court_id = map_cl_to_pacer_id(d.court_id)
    # Pick the parser matching the upload type.
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(court_id)
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(court_id)
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(court_id)
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        report = InternetArchive(court_id)
    elif report_type == UPLOAD_TYPE.CASE_REPORT_PAGE:
        report = CaseQuery(court_id)
    elif report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        report = ClaimsRegister(court_id)
    else:
        raise NotImplementedError("The report type with id '%s' is not yet "
                                  "supported. Perhaps you need to add it?" %
                                  report_type)

    # Open in binary mode and decode explicitly: a text-mode read() returns
    # str on Python 3, which has no .decode() method.
    with open(filepath, "rb") as f:
        text = f.read().decode("utf-8")
    report._parse_text(text)
    data = report.data
    if data == {}:
        # Not a usable report; nothing to merge.
        return None

    if report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        add_bankruptcy_data_to_docket(d, data)
        add_claims_to_docket(d, data["claims"])
    else:
        update_docket_metadata(d, data)
        d, og_info = update_docket_appellate_metadata(d, data)
        if og_info is not None:
            og_info.save()
            d.originating_court_information = og_info
        d.save()
        if data.get("docket_entries"):
            add_docket_entries(d, data["docket_entries"])

    if report_type in (
        UPLOAD_TYPE.DOCKET,
        UPLOAD_TYPE.APPELLATE_DOCKET,
        UPLOAD_TYPE.IA_XML_FILE,
    ):
        add_parties_and_attorneys(d, data["parties"])
    return d.pk
def process_docket_data(d, filepath, report_type):
    """Process docket data file.

    :param d: A docket object to work on.
    :param filepath: The path to a saved HTML file containing docket or docket
    history report data.
    :param report_type: Whether it's a docket or a docket history report.
    :return: The docket's PK, or None if the report parsed to no data.
    """
    from cl.recap.tasks import update_docket_metadata, add_docket_entries, \
        add_parties_and_attorneys, update_docket_appellate_metadata
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        # NOTE(review): InternetArchive takes no court here, unlike the other
        # parsers — confirm against the juriscraper version in use.
        report = InternetArchive()
    else:
        # Previously an unknown report_type left `report` unbound and crashed
        # below with UnboundLocalError; fail fast with a clear message.
        raise NotImplementedError(
            "The report type with id '%s' is not yet supported." % report_type)
    # Open in binary mode and decode explicitly: a text-mode read() returns
    # str on Python 3, which has no .decode() method.
    with open(filepath, 'rb') as f:
        text = f.read().decode('utf-8')
    report._parse_text(text)
    data = report.data
    if data == {}:
        # Not a usable report; nothing to merge.
        return None
    update_docket_metadata(d, data)
    d, og_info = update_docket_appellate_metadata(d, data)
    if og_info is not None:
        og_info.save()
        d.originating_court_information = og_info
    d.save()
    add_docket_entries(d, data['docket_entries'])
    if report_type in (UPLOAD_TYPE.DOCKET, UPLOAD_TYPE.APPELLATE_DOCKET,
                       UPLOAD_TYPE.IA_XML_FILE):
        add_parties_and_attorneys(d, data['parties'])
    return d.pk
def get_docket_json(self):
    """Download docket to disk from Pacer

    :return: None
    """
    query = Query()
    fjc_table = TinyDB("db/master.json").table("fjc")
    # Only rows with a PACER case id that haven't been fetched yet.
    pending = fjc_table.search(
        ~(query.PACER_CASE_ID == "") & (query.JSON == "False")
    )
    for row in pending:
        case_id = row["PACER_CASE_ID"]
        report = DocketReport(row["COURT"], self.s)
        report.query(
            case_id,
            show_parties_and_counsel=True,
            show_terminated_parties=True,
            show_list_of_member_cases=True,
            include_pdf_headers=True,
            show_multiple_docs=False,
        )
        json_path = "downloads/json/pacer_docket_%s.json" % case_id
        with open(json_path, "w") as json_file:
            json.dump(report.data, json_file, indent=4, sort_keys=True,
                      default=str)
        html_path = "downloads/html/pacer_docket_%s.html" % case_id
        with open(html_path, "w") as html_file:
            html_file.write(report.response.text)
        # Mark the row as fetched and remember its first doc id.
        fjc_table.update(
            {
                "JSON": "True",
                "pacer_doc_id":
                    report.data["docket_entries"][0]["pacer_doc_id"],
            },
            doc_ids=[row.doc_id],
        )
    logging.info("Finished collecting JSON and HTML")
def process_docket_data(d, filepath, report_type):
    """Process docket data file.

    :param d: A docket object to work on.
    :param filepath: The path to a saved HTML file containing docket or docket
    history report data.
    :param report_type: Whether it's a docket or a docket history report.
    :return: The docket's PK, or None if the report parsed to no data.
    """
    from cl.recap.tasks import update_docket_metadata, add_docket_entries, \
        add_parties_and_attorneys, update_docket_appellate_metadata
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        # NOTE(review): InternetArchive takes no court here, unlike the other
        # parsers — confirm against the juriscraper version in use.
        report = InternetArchive()
    elif report_type == UPLOAD_TYPE.CASE_REPORT_PAGE:
        report = CaseQuery(map_cl_to_pacer_id(d.court_id))
    else:
        # Previously an unknown report_type left `report` unbound and crashed
        # below with UnboundLocalError; fail fast with a clear message.
        raise NotImplementedError(
            "The report type with id '%s' is not yet supported." % report_type)
    # Open in binary mode and decode explicitly: a text-mode read() returns
    # str on Python 3, which has no .decode() method.
    with open(filepath, 'rb') as f:
        text = f.read().decode('utf-8')
    report._parse_text(text)
    data = report.data
    if data == {}:
        # Not a usable report; nothing to merge.
        return None
    update_docket_metadata(d, data)
    d, og_info = update_docket_appellate_metadata(d, data)
    if og_info is not None:
        og_info.save()
        d.originating_court_information = og_info
    d.save()
    if data.get('docket_entries'):
        add_docket_entries(d, data['docket_entries'])
    if report_type in (UPLOAD_TYPE.DOCKET, UPLOAD_TYPE.APPELLATE_DOCKET,
                       UPLOAD_TYPE.IA_XML_FILE):
        add_parties_and_attorneys(d, data['parties'])
    return d.pk
def run_parsers_on_path(self, path_root, required_fields=None):
    """Test all the parsers, faking the network query.

    :param path_root: Directory tree walked for ``*.html`` fixture files.
    :param required_fields: Docket fields that must be truthy in every parsed
        result. Defaults to date_filed, case_name, and docket_number.
    """
    if required_fields is None:
        # A literal list default would be a shared mutable default argument.
        required_fields = ['date_filed', 'case_name', 'docket_number']
    paths = []
    for root, dirnames, filenames in os.walk(path_root):
        for filename in fnmatch.filter(filenames, '*.html'):
            paths.append(os.path.join(root, filename))
    paths.sort()
    path_max_len = max(len(path) for path in paths) + 2
    for i, path in enumerate(paths):
        sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
        t1 = time.time()
        dirname, filename = os.path.split(path)
        filename_sans_ext = filename.split('.')[0]
        json_path = os.path.join(dirname, '%s.json' % filename_sans_ext)
        court = filename_sans_ext.split('_')[0]
        report = DocketReport(court)
        # Read bytes and decode explicitly: a text-mode read() returns str on
        # Python 3, which has no .decode() method.
        with open(path, 'rb') as f:
            report._parse_text(f.read().decode('utf-8'))
        data = report.data
        if data != {}:
            # If the docket is a valid docket, make sure some required
            # fields are populated.
            for field in required_fields:
                self.assertTrue(
                    data[field],
                    msg="Unable to find truthy value for field %s" % field,
                )
            self.assertEqual(data['court_id'], court)
            # Party-specific tests...
            for party in data['parties']:
                self.assertTrue(
                    party.get('name', False),
                    msg="Every party must have a name attribute. Did not "
                        "get a value for:\n\n%s" % party
                )
                # Protect against effed up adversary proceedings cases that
                # don't parse properly. See: cacb, 2:08-ap-01570-BB
                self.assertNotIn('----', party['name'])
        if not os.path.isfile(json_path):
            bar = "*" * 50
            print("\n\n%s\nJSON FILE DID NOT EXIST. CREATING IT AT:"
                  "\n\n %s\n\n"
                  "Please test the data in this file before assuming "
                  "everything worked.\n%s\n" % (bar, json_path, bar))
            with open(json_path, 'w') as f:
                json.dump(data, f, indent=2, sort_keys=True)
            continue
        with open(json_path) as f:
            j = json.load(f)
            if j != {}:
                # Compare docket entries and parties first, for easier
                # debugging, then compare whole objects to be sure.
                self.assertEqual(j['docket_entries'], data['docket_entries'])
                self.assertEqual(j['parties'], data['parties'])
            self.assertEqual(j, data)
        t2 = time.time()
        duration = t2 - t1
        warn_or_crash_slow_parser(duration, max_duration=1)
        sys.stdout.write("✓ - %0.1fs\n" % (t2-t1))
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or recap
            // document was created (implying a Solr needs updating).
            'needs_solr_update': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')

    if 'History/Documents' in text:
        # Prior to 1.1.8, we did not separate docket history reports into
        # their own upload_type. Alas, we still have some old clients around,
        # so we need to handle those clients here: re-tag the item and hand
        # it to the history-report pipeline instead.
        pq.upload_type = pq.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.callbacks = None
        return None

    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if docket_data == {}:
        # Not really a docket. Some sort of invalid document (see
        # Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, pq.INVALID_CONTENT)
        self.request.callbacks = None
        return None

    # Merge the contents of the docket into CL. Attempt several lookups of
    # decreasing specificity. Note that pacer_case_id is required for Docket
    # uploads.
    d = None
    for kwargs in [{'pacer_case_id': pq.pacer_case_id,
                    'docket_number': docket_data['docket_number']},
                   {'pacer_case_id': pq.pacer_case_id},
                   {'docket_number': docket_data['docket_number'],
                    'pacer_case_id': None}]:
        try:
            d = Docket.objects.get(court_id=pq.court_id, **kwargs)
            break
        except Docket.DoesNotExist:
            continue
        except Docket.MultipleObjectsReturned:
            # Ambiguous lookup: mark the queue item failed and stop the chain.
            msg = "Too many dockets found when trying to look up '%s'" % pq
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            self.request.callbacks = None
            return None

    if d is None:
        # Couldn't find it. Make a new one.
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pq.pacer_case_id,
            court_id=pq.court_id
        )

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER

    update_docket_metadata(d, docket_data)

    if pq.debug:
        # Debug mode: report success without saving anything to the DB.
        mark_pq_successful(pq, d_id=d.pk)
        self.request.callbacks = None
        return {'docket_pk': d.pk, 'needs_solr_update': False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries & documents
    rds_created = []
    needs_solr_update = False
    for docket_entry in docket_data['docket_entries']:
        try:
            de, de_created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue
        if de_created:
            needs_solr_update = True

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        params = {
            'docket_entry': de,
            # No attachments when uploading dockets.
            'document_type': RECAPDocument.PACER_DOCUMENT,
            'document_number': docket_entry['document_number'],
        }
        try:
            rd = RECAPDocument.objects.get(**params)
        except RECAPDocument.DoesNotExist:
            rd = RECAPDocument.objects.create(
                pacer_doc_id=docket_entry['pacer_doc_id'],
                is_available=False,
                **params
            )
            rds_created.append(rd)
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue
        else:
            # NOTE(review): this assignment is not followed by rd.save() in
            # this block — presumably persisted elsewhere or intentionally
            # in-memory; verify.
            rd.pacer_doc_id = rd.pacer_doc_id or pq.pacer_doc_id

    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        'docket_pk': d.pk,
        'needs_solr_update': bool(rds_created or needs_solr_update),
    }
def run_parsers_on_path(self, path_root, required_fields=None):
    """Test all the parsers, faking the network query.

    :param path_root: Directory tree walked for ``*.html`` fixture files.
    :param required_fields: Docket fields that must be truthy in every parsed
        result. Defaults to date_filed, case_name, and docket_number.
    """
    if required_fields is None:
        # A literal list default would be a shared mutable default argument.
        required_fields = ['date_filed', 'case_name', 'docket_number']
    paths = []
    for root, dirnames, filenames in os.walk(path_root):
        for filename in fnmatch.filter(filenames, '*.html'):
            paths.append(os.path.join(root, filename))
    paths.sort()
    path_max_len = max(len(path) for path in paths) + 2
    for i, path in enumerate(paths):
        sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
        t1 = time.time()
        dirname, filename = os.path.split(path)
        filename_sans_ext = filename.split('.')[0]
        json_path = os.path.join(dirname, '%s.json' % filename_sans_ext)
        court = filename_sans_ext.split('_')[0]
        report = DocketReport(court)
        # Read bytes and decode explicitly: a text-mode read() returns str on
        # Python 3, which has no .decode() method.
        with open(path, 'rb') as f:
            report._parse_text(f.read().decode('utf-8'))
        data = report.data
        if data != {}:
            # If the docket is a valid docket, make sure some required
            # fields are populated.
            for field in required_fields:
                self.assertTrue(
                    data[field],
                    msg="Unable to find truthy value for field %s" % field,
                )
            self.assertEqual(data['court_id'], court)
            # Party-specific tests...
            for party in data['parties']:
                self.assertTrue(
                    party.get('name', False),
                    msg="Every party must have a name attribute. Did not "
                        "get a value for:\n\n%s" % party)
                # Protect against effed up adversary proceedings cases that
                # don't parse properly. See: cacb, 2:08-ap-01570-BB
                self.assertNotIn('----', party['name'])
        if not os.path.isfile(json_path):
            bar = "*" * 50
            print("\n\n%s\nJSON FILE DID NOT EXIST. CREATING IT AT:"
                  "\n\n %s\n\n"
                  "Please test the data in this file before assuming "
                  "everything worked.\n%s\n" % (bar, json_path, bar))
            with open(json_path, 'w') as f:
                json.dump(data, f, indent=2, sort_keys=True)
            continue
        with open(json_path) as f:
            j = json.load(f)
            if j != {}:
                # Compare docket entries and parties first, for easier
                # debugging, then compare whole objects to be sure.
                self.assertEqual(j['docket_entries'], data['docket_entries'])
                self.assertEqual(j['parties'], data['parties'])
            self.assertEqual(j, data)
        t2 = time.time()
        max_duration = 1
        duration = t2 - t1
        if duration > max_duration:
            if sys.gettrace() is None and not IS_TRAVIS:
                # Don't do this if we're debugging.
                raise SlownessException(
                    "The parser for '{fn}' took {duration}s to test, "
                    "which is more than the maximum allowed duration of "
                    "{max_duration}s.".format(
                        fn=filename,
                        duration=duration,
                        max_duration=max_duration,
                    ))
        sys.stdout.write("✓ - %0.1fs\n" % (t2 - t1))
def setUpClass(cls):
    """Share one DocketReport and case id across the test class."""
    # 1:07-cr-00001-RJA-HKS USA v. Green
    cls.pacer_case_id = '62866'
    cls.report = DocketReport(
        'psc', PacerSession(username='******', password='******'))
#!/usr/bin/env python
#
# Takes an .html file on the command line, parses it using the PACER
# Docket Report parser, and outputs json to stdout.
import jsondate3 as json
import sys

from juriscraper.pacer.http import PacerSession
from juriscraper.pacer import DocketReport

pacer_session = PacerSession(username="******", password="******")
report = DocketReport("psc", pacer_session)
for path in sys.argv[1:]:
    # Read bytes and decode explicitly so this works on Python 2 and 3
    # (a Python 3 text-mode read() returns str, which has no .decode()).
    with open(path, "rb") as f:
        report._parse_text(f.read().decode("utf-8"))
    data = report.data
    # print() as a function call works on both Python 2 and 3 for a
    # single argument; the old `print x` statement is a SyntaxError on 3.
    print(json.dumps(data, indent=2, sort_keys=True, separators=(",", ": ")))
def process_recap_docket(pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :return: The Docket that was created/updated, or None if the lookup was
    ambiguous.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    # Merge the contents of the docket into CL
    try:
        # Match on either the PACER case id or the docket number within
        # this court.
        d = Docket.objects.get(
            Q(pacer_case_id=pq.pacer_case_id) |
            Q(docket_number=docket_data['docket_number']),
            court_id=pq.court_id,
        )
        # Add RECAP as a source if it's not already.
        if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
            d.source = Docket.RECAP_AND_SCRAPER
        elif d.source == Docket.COLUMBIA:
            d.source = Docket.COLUMBIA_AND_RECAP
        elif d.source == Docket.COLUMBIA_AND_SCRAPER:
            d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    except Docket.DoesNotExist:
        # No match; start a fresh docket (not yet saved).
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pq.pacer_case_id,
            court_id=pq.court_id
        )
    except Docket.MultipleObjectsReturned:
        # Ambiguous lookup: mark the queue item failed and bail out.
        msg = "Too many dockets found when trying to look up '%s'" % pq
        logger.error(msg)
        pq.error_message = msg
        pq.status = pq.PROCESSING_FAILED
        pq.save()
        return None

    update_docket_metadata(d, docket_data)

    if pq.debug:
        # Debug mode: report success without saving anything to the DB.
        mark_pq_successful(pq, d_id=d.pk)
        return d

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            RECAPDocument.objects.create(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
                pacer_doc_id=docket_entry['pacer_doc_id'],
                is_available=False,
            )
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue
        else:
            # NOTE(review): this assignment is not followed by rd.save() in
            # this block — presumably persisted elsewhere or intentionally
            # in-memory; verify.
            rd.pacer_doc_id = rd.pacer_doc_id or pq.pacer_doc_id

    add_parties_and_attorneys(d, docket_data['parties'])
    mark_pq_successful(pq, d_id=d.pk)
    return d
def get_docket_by_pacer_case_id(self, data, court_id, cookies,
                                tag_names=None, **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of
    kwargs that can be passed to the DocketReport query. For details of
    acceptable parameters, see DocketReport.query()

    :param data: A dict containing:
        Required: 'pacer_case_id': The internal case ID of the item in PACER.
        Optional: 'docket_pk': The ID of the docket to work on to avoid
        lookups if it's known in advance.
    :param court_id: A courtlistener court ID.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a
    logged-in PACER user.
    :param tag_names: A list of tag names that should be stored with the item
    in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    :return: A dict indicating if we need to update Solr.
    """
    s = PacerSession(cookies=cookies)
    if data is None:
        # An earlier task in the chain produced nothing; stop the chain.
        logger.info("Empty data argument. Terminating "
                    "chains and exiting.")
        self.request.callbacks = None
        return

    pacer_case_id = data.get('pacer_case_id')
    report = DocketReport(map_cl_to_pacer_id(court_id), s)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))

    if data.get('docket_pk') is not None:
        d = Docket.objects.get(pk=data['docket_pk'])
    else:
        try:
            d = Docket.objects.get(
                pacer_case_id=pacer_case_id,
                court_id=court_id,
            )
        except Docket.DoesNotExist:
            d = None
        except Docket.MultipleObjectsReturned:
            d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        if first_missing_id > 1:
            # We don't have to get the whole thing!
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" % (court_id,
                                                             pacer_case_id))

    if not docket_data:
        logger.info("No valid docket data for %s.%s", court_id,
                    pacer_case_id)
        self.request.callbacks = None
        return

    # Merge the contents into CL.
    if d is None:
        d, count = find_docket_object(court_id, pacer_case_id,
                                      docket_data['docket_number'])
        if count > 1:
            # Several candidate dockets; take the oldest.
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d.save()

    tags = []
    if tag_names is not None:
        for tag_name in tag_names:
            tag, _ = Tag.objects.get_or_create(name=tag_name)
            tag.tag_object(d)
            tags.append(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d,
                                upload_type=UPLOAD_TYPE.DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, content_updated = add_docket_entries(
        d, docket_data['docket_entries'], tags=tags)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)
    return {
        'docket_pk': d.pk,
        'content_updated': bool(rds_created or content_updated),
    }
def process_docket_data(
    d: Docket,
    report_type: int,
    filepath: Optional[str] = None,
) -> Optional[int]:
    """Process docket data file.

    :param d: A docket object to work on.
    :param report_type: Whether it's a docket or a docket history report.
    :param filepath: A local path where the item can be found. If not
    provided, the filepath_local field of the docket object will be attempted.
    :return: The docket's PK, or None if the report parsed to no data.
    """
    from cl.recap.mergers import (
        add_bankruptcy_data_to_docket,
        add_claims_to_docket,
        add_docket_entries,
        add_parties_and_attorneys,
        update_docket_appellate_metadata,
        update_docket_metadata,
    )

    court_id = map_cl_to_pacer_id(d.court_id)
    # Pick the parser matching the upload type.
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(court_id)
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(court_id)
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(court_id)
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        report = InternetArchive(court_id)
    elif report_type == UPLOAD_TYPE.CASE_REPORT_PAGE:
        report = CaseQuery(court_id)
    elif report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        report = ClaimsRegister(court_id)
    else:
        raise NotImplementedError(
            "The report type with id '%s' is not yet "
            "supported. Perhaps you need to add it?" % report_type
        )

    if filepath:
        with open(filepath, "r") as f:
            text = f.read()
    else:
        # This is an S3 path, so get it remotely.
        text = d.filepath_local.read().decode()

    report._parse_text(text)
    data = report.data
    if data == {}:
        # Not a usable report; nothing to merge.
        return None

    if report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        add_bankruptcy_data_to_docket(d, data)
        add_claims_to_docket(d, data["claims"])
    else:
        update_docket_metadata(d, data)
        d, og_info = update_docket_appellate_metadata(d, data)
        if og_info is not None:
            og_info.save()
            d.originating_court_information = og_info
        d.save()
        if data.get("docket_entries"):
            add_docket_entries(d, data["docket_entries"])

    if report_type in (
        UPLOAD_TYPE.DOCKET,
        UPLOAD_TYPE.APPELLATE_DOCKET,
        UPLOAD_TYPE.IA_XML_FILE,
    ):
        # Only these report types carry party data.
        add_parties_and_attorneys(d, data["parties"])
    return d.pk
def process_recap_docket(pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item to work on.
    :return: The Docket that was created/updated, or None if the lookup was
    ambiguous.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item: %s" % pq)

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    # NOTE(review): later versions of this pipeline call report._parse_text();
    # confirm parse_text() exists on the juriscraper version in use here.
    report.parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    # Merge the contents of the docket into CL
    try:
        d = Docket.objects.get(
            Q(pacer_case_id=pq.pacer_case_id) |
            Q(docket_number=docket_data['docket_number']),
            court_id=pq.court_id,
        )
        # Add RECAP as a source if it's not already.
        if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
            d.source = Docket.RECAP_AND_SCRAPER
        elif d.source == Docket.COLUMBIA:
            d.source = Docket.COLUMBIA_AND_RECAP
        elif d.source == Docket.COLUMBIA_AND_SCRAPER:
            d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    except Docket.DoesNotExist:
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pq.pacer_case_id,
            court_id=pq.court_id
        )
    except Docket.MultipleObjectsReturned:
        msg = "Too many dockets found when trying to look up '%s'" % pq
        logger.error(msg)
        pq.error_message = msg
        pq.status = pq.PROCESSING_FAILED
        pq.save()
        return None

    update_docket_metadata(d, docket_data)
    d.save()

    # Docket entries
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            RECAPDocument.objects.create(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
                pacer_doc_id=docket_entry['pacer_doc_id'],
                is_available=False,
            )
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue
        else:
            # Bug fix: docket_entry is a dict, so the old attribute access
            # (docket_entry.pacer_doc_id) raised AttributeError here. Use
            # item access, as everywhere else in this function.
            rd.pacer_doc_id = rd.pacer_doc_id or docket_entry['pacer_doc_id']

    # Parties
    for party in docket_data['parties']:
        try:
            p = Party.objects.get(name=party['name'])
        except Party.DoesNotExist:
            p = Party.objects.create(
                name=party['name'],
                extra_info=party['extra_info'],
            )
        except Party.MultipleObjectsReturned:
            continue
        else:
            if party['extra_info']:
                p.extra_info = party['extra_info']
                p.save()

        # If the party type doesn't exist, make a new one.
        if not p.party_types.filter(docket=d, name=party['type']).exists():
            PartyType.objects.create(docket=d, party=p, name=party['type'])

        # Attorneys
        for atty in party.get('attorneys', []):
            add_attorney(atty, p, d)

    pq.error_message = ''  # Clear out errors b/c successful
    pq.status = pq.PROCESSING_SUCCESSFUL
    pq.save()
    return d
def setUp(self):
    """Log into PACER and build a fresh DocketReport for each test."""
    session = get_pacer_session()
    session.login()
    self.session = session
    # 4:06-cv-07294 Foley v. Bates
    self.pacer_case_id = "186730"
    self.report = DocketReport("cand", self.session)
class PacerDocketReportTest(unittest.TestCase):
    """A variety of tests for the docket report"""

    def setUp(self):
        # Each test gets a fresh logged-in session and report.
        self.session = get_pacer_session()
        self.session.login()
        self.report = DocketReport("cand", self.session)
        self.pacer_case_id = "186730"  # 4:06-cv-07294 Foley v. Bates

    @staticmethod
    def _count_rows(html):
        """Count the rows in the docket report.

        :param html: The HTML of the docket report.
        :return: The count of the number of rows.
        """
        tree = get_html_parsed_text(html)
        return len(tree.xpath("//table[./tr/td[3]]/tr")) - 1  # No header row

    @SKIP_IF_NO_PACER_LOGIN
    def test_queries(self):
        """Do a variety of queries work?"""
        self.report.query(self.pacer_case_id)
        self.assertIn(
            "Foley v. Bates",
            self.report.response.text,
            msg="Super basic query failed",
        )

        # Filter by start date only.
        self.report.query(self.pacer_case_id, date_start=date(2007, 11, 1))
        row_count = self._count_rows(self.report.response.text)
        self.assertEqual(
            2,
            row_count,
            msg="Didn't get expected number of "
            "rows when filtering by start "
            "date. Got %s." % row_count,
        )

        # Filter by a start and end date window.
        self.report.query(
            self.pacer_case_id,
            date_start=date(2007, 11, 1),
            date_end=date(2007, 11, 28),
        )
        row_count = self._count_rows(self.report.response.text)
        self.assertEqual(
            1,
            row_count,
            msg="Didn't get expected number of "
            "rows when filtering by start and "
            "end dates. Got %s." % row_count,
        )

        # Filter by a document number range.
        self.report.query(self.pacer_case_id, doc_num_start=5,
                          doc_num_end=5)
        row_count = self._count_rows(self.report.response.text)
        self.assertEqual(
            1,
            row_count,
            msg="Didn't get expected number of rows "
            "when filtering by doc number. Got "
            "%s" % row_count,
        )

        # Filter by date *entered* rather than date filed.
        self.report.query(
            self.pacer_case_id,
            date_start=date(2007, 11, 1),
            date_end=date(2007, 11, 28),
            date_range_type="Entered",
        )
        row_count = self._count_rows(self.report.response.text)
        self.assertEqual(
            1,
            row_count,
            msg="Didn't get expected number of rows "
            "when filtering by start and end "
            "dates and date_range_type of "
            "Entered. Got %s" % row_count,
        )

        # Party info is only present when explicitly requested.
        self.report.query(
            self.pacer_case_id,
            doc_num_start=500,
            show_parties_and_counsel=True,
        )
        self.assertIn(
            "Cheema",
            self.report.response.text,
            msg="Didn't find party info when it was explicitly "
            "requested.",
        )
        self.report.query(
            self.pacer_case_id,
            doc_num_start=500,
            show_parties_and_counsel=False,
        )
        self.assertNotIn(
            "Cheema",
            self.report.response.text,
            msg="Got party info but it was not requested.",
        )

    @SKIP_IF_NO_PACER_LOGIN
    def test_using_same_report_twice(self):
        """Do the caches get properly nuked between runs?

        See issue #187.
        """
        # Query the first one...
        self.report.query(self.pacer_case_id)
        d = self.report.data.copy()

        # Then the second one...
        second_pacer_case_id = "63111"  # 1:07-cv-00035-RJA-HKS Anson v. USA
        self.report.query(second_pacer_case_id)
        d2 = self.report.data.copy()
        self.assertNotEqual(
            d,
            d2,
            msg="Got same values for docket data of two different queries. "
            "Is there a problem with the caches on the DocketReport?",
        )
def setUp(self):
    """Create a logged-in PACER session and a cand DocketReport."""
    s = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    s.login()
    # 4:06-cv-07294 Foley v. Bates
    self.pacer_case_id = '186730'
    self.report = DocketReport('cand', s)
def setUpClass(cls):
    """Share one DocketReport and case id across the test class."""
    # 4:06-cv-07294 Foley v. Bates
    cls.pacer_case_id = '186730'
    cls.report = DocketReport(
        'cand',
        PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD),
    )
def get_docket_by_pacer_case_id(self, pacer_case_id, court_id, session,
                                tag=None, **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of
    kwargs that can be passed to the DocketReport query. For details of
    acceptable parameters, see DocketReport.query()

    :param pacer_case_id: The internal case ID of the item in PACER.
    :param court_id: A courtlistener court ID.
    :param session: A valid PacerSession object.
    :param tag: The tag name that should be stored with the item in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    :return: A dict with the docket PK and whether Solr needs updating.
    """
    report = DocketReport(map_cl_to_pacer_id(court_id), session)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))

    # Look up an existing docket so we can shrink the query below.
    # NOTE(review): MultipleObjectsReturned is treated the same as
    # DoesNotExist here, falling through to the fuzzy lookup further down.
    try:
        d = Docket.objects.get(
            pacer_case_id=pacer_case_id,
            court_id=court_id,
        )
    except Docket.DoesNotExist:
        d = None
    except Docket.MultipleObjectsReturned:
        d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        if first_missing_id > 1:
            # We don't have to get the whole thing!
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" %
                (court_id, pacer_case_id))

    # Merge the contents into CL.
    if d is None:
        d, count = find_docket_object(court_id, pacer_case_id,
                                      docket_data['docket_number'])
        if count > 1:
            # Several candidate dockets; prefer the oldest one.
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d.save()
    if tag is not None:
        # The tag arg is a name; swap it for the Tag object itself so it can
        # also be applied to the docket entries below.
        tag, _ = Tag.objects.get_or_create(name=tag)
        d.tags.add(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d, upload_type=DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, needs_solr_update = add_docket_entries(
        d, docket_data['docket_entries'], tag=tag)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)
    return {
        'docket_pk': d.pk,
        'needs_solr_update': bool(rds_created or needs_solr_update),
    }
def get_docket_by_pacer_case_id(self, pacer_case_id, court_id, session,
                                tag=None, **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of
    kwargs that can be passed to the DocketReport query. For details of
    acceptable parameters, see DocketReport.query()

    :param pacer_case_id: The internal case ID of the item in PACER.
    :param court_id: A courtlistener court ID.
    :param session: A valid PacerSession object.
    :param tag: The tag name that should be stored with the item in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    :return: The Docket object, or None if multiple dockets matched.
    """
    report = DocketReport(map_cl_to_pacer_id(court_id), session)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))

    # Try an exact lookup first so the query below can be narrowed.
    try:
        d = Docket.objects.get(
            pacer_case_id=pacer_case_id,
            court_id=court_id,
        )
    except Docket.DoesNotExist:
        d = None
    except Docket.MultipleObjectsReturned:
        d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)

    # first_missing_id is only bound when d is not None; the short-circuit
    # here guards its use.
    if d is not None and first_missing_id > 1:
        # We don't have to get the whole thing!
        kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" %
                (court_id, pacer_case_id))

    # Merge the contents into CL.
    try:
        if d is None:
            # Fuzzy lookup: match by PACER case id OR docket number.
            d = Docket.objects.get(
                Q(pacer_case_id=pacer_case_id) |
                Q(docket_number=docket_data['docket_number']),
                court_id=court_id,
            )
        # Add RECAP as a source if it's not already.
        if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
            d.source = Docket.RECAP_AND_SCRAPER
        elif d.source == Docket.COLUMBIA:
            d.source = Docket.COLUMBIA_AND_RECAP
        elif d.source == Docket.COLUMBIA_AND_SCRAPER:
            d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    except Docket.DoesNotExist:
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pacer_case_id,
            court_id=court_id
        )
    except Docket.MultipleObjectsReturned:
        logger.error("Too many dockets returned when trying to look up '%s.%s'" % (court_id, pacer_case_id))
        return None

    update_docket_metadata(d, docket_data)
    d.save()
    if tag is not None:
        # Replace the tag name with the Tag object for reuse below.
        tag, _ = Tag.objects.get_or_create(name=tag)
        d.tags.add(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s.%s'" %
                (docket_entry['document_number'], court_id, pacer_case_id)
            )
            continue
        else:
            if tag is not None:
                de.tags.add(tag)

        # Find or create the matching RECAPDocument for this entry.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            try:
                rd = RECAPDocument.objects.create(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                    pacer_doc_id=docket_entry['pacer_doc_id'],
                    is_available=False,
                )
            except IntegrityError:
                # Race condition. The item was created after our get failed.
                rd = RECAPDocument.objects.get(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                )
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry "
                "number: '%s', docket: %s" %
                (docket_entry['document_number'], d)
            )
            continue

        # NOTE(review): rd.pacer_doc_id is assigned but rd.save() is never
        # called in this block, so the backfill may not persist — confirm
        # whether a save is expected here.
        rd.pacer_doc_id = rd.pacer_doc_id or docket_entry['pacer_doc_id']
        if tag is not None:
            rd.tags.add(tag)

    add_parties_and_attorneys(d, docket_data['parties'])
    logger.info("Created/updated docket: %s" % d)
    return d
def process_recap_docket(pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to
    work on.
    :return: The docket that's created or updated, or None on failure or
    invalid content.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if docket_data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, pq.INVALID_CONTENT)
        return None

    # Merge the contents of the docket into CL. Attempt several lookups of
    # decreasing specificity.
    d = None
    for kwargs in [{'pacer_case_id': pq.pacer_case_id,
                    'docket_number': docket_data['docket_number']},
                   {'pacer_case_id': pq.pacer_case_id},
                   {'docket_number': docket_data['docket_number']}]:
        try:
            d = Docket.objects.get(court_id=pq.court_id, **kwargs)
            break
        except Docket.DoesNotExist:
            continue
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to look up '%s'" % pq
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None

    if d is None:
        # Couldn't find it. Make a new one.
        d = Docket(source=Docket.RECAP,
                   pacer_case_id=pq.pacer_case_id,
                   court_id=pq.court_id)

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER

    update_docket_metadata(d, docket_data)

    if pq.debug:
        # Debug mode: report success without writing anything to the DB.
        mark_pq_successful(pq, d_id=d.pk)
        return d

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                })
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq))
            continue

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            try:
                RECAPDocument.objects.create(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                    pacer_doc_id=docket_entry['pacer_doc_id'],
                    is_available=False,
                )
            except IntegrityError:
                # logger.warn is a deprecated alias for logger.warning.
                logger.warn(
                    "Creating new document with pacer_doc_id of '%s' violates "
                    "unique constraint on pacer_doc_id field." %
                    docket_entry['pacer_doc_id'])
                continue
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq))
            continue
        else:
            # NOTE(review): this backfills from pq.pacer_doc_id (the queue
            # item's doc id), while sibling versions use the per-entry
            # docket_entry['pacer_doc_id'] — confirm which is intended.
            # Also, rd is never saved after this assignment.
            rd.pacer_doc_id = rd.pacer_doc_id or pq.pacer_doc_id

    add_parties_and_attorneys(d, docket_data['parties'])
    mark_pq_successful(pq, d_id=d.pk)
    return d
def run_parsers_on_path(
    self,
    path_root,
    # Mutable default arg: read-only here, so it is harmless, but be careful
    # never to mutate it.
    required_fields=["date_filed", "case_name", "docket_number"],
):
    """Test all the parsers, faking the network query.

    Walks path_root for *.html fixtures, parses each with DocketReport, and
    compares the result to a sibling .json file (creating the .json file if
    it does not exist yet).

    :param path_root: Directory tree to scan for .html fixture files.
    :param required_fields: Keys that must be truthy in every parsed docket.
    """
    paths = []
    for root, dirnames, filenames in os.walk(path_root):
        for filename in fnmatch.filter(filenames, "*.html"):
            paths.append(os.path.join(root, filename))
    paths.sort()
    path_max_len = max(len(path) for path in paths) + 2
    for i, path in enumerate(paths):
        sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
        t1 = time.time()
        dirname, filename = os.path.split(path)
        filename_sans_ext = filename.split(".")[0]
        json_path = os.path.join(dirname, "%s.json" % filename_sans_ext)
        # Fixture filenames are prefixed with the court id, e.g. cand_1.html.
        court = filename_sans_ext.split("_")[0]

        report = DocketReport(court)
        with open(path, "rb") as f:
            report._parse_text(f.read().decode("utf-8"))
        data = report.data

        if data != {}:
            # If the docket is a valid docket, make sure some required
            # fields are populated.
            for field in required_fields:
                self.assertTrue(
                    data[field],
                    msg="Unable to find truthy value for field %s" % field,
                )

            self.assertEqual(data["court_id"], court)

            # Party-specific tests...
            for party in data["parties"]:
                self.assertTrue(
                    party.get("name", False),
                    msg="Every party must have a name attribute. Did not "
                    "get a value for:\n\n%s" % party,
                )
                # Protect against effed up adversary proceedings cases that
                # don't parse properly. See: cacb, 2:08-ap-01570-BB
                self.assertNotIn("----", party["name"])

        if not os.path.isfile(json_path):
            # First run for this fixture: write the comparison file and move
            # on without asserting anything.
            bar = "*" * 50
            print(
                "\n\n%s\nJSON FILE DID NOT EXIST. CREATING IT AT:"
                "\n\n  %s\n\n"
                "Please test the data in this file before assuming "
                "everything worked.\n%s\n" % (bar, json_path, bar)
            )
            with open(json_path, "w") as f:
                json.dump(data, f, indent=2, sort_keys=True)
                # self.assertFalse(True)
            continue

        with open(json_path) as f:
            j = json.load(f)
            if j != {}:
                # Compare docket entries and parties first, for easier
                # debugging, then compare whole objects to be sure.
                self.assertEqual(
                    j["docket_entries"], data["docket_entries"]
                )
                self.assertEqual(j["parties"], data["parties"])
                self.assertEqual(j, data)

        t2 = time.time()
        duration = t2 - t1
        warn_or_crash_slow_parser(duration, max_duration=1)

        sys.stdout.write("✓ - %0.1fs\n" % (t2 - t1))
def setUpClass(cls):
    """Log into the PACER training site and share one report per class."""
    session = login('psc', 'tr1234', 'Pass!234')
    cls.report = DocketReport('psc', session)
    cls.pacer_case_id = '62866'
class PacerDocketReportTest(unittest.TestCase):
    """A variety of tests for the docket report"""

    # NOTE(review): these tests query live PACER; the hard-coded row counts
    # assume the Foley v. Bates docket is unchanged.

    def setUp(self):
        # Fresh authenticated session and report for each test.
        pacer_session = PacerSession(username=PACER_USERNAME,
                                     password=PACER_PASSWORD)
        pacer_session.login()
        self.report = DocketReport('cand', pacer_session)
        self.pacer_case_id = '186730'  # 4:06-cv-07294 Foley v. Bates

    @staticmethod
    def _count_rows(html):
        """Count the rows in the docket report.

        :param html: The HTML of the docket report.
        :return: The count of the number of rows.
        """
        tree = get_html_parsed_text(html)
        return len(tree.xpath('//table[./tr/td[3]]/tr')) - 1  # No header row

    @SKIP_IF_NO_PACER_LOGIN
    def test_queries(self):
        """Do a variety of queries work?"""
        self.report.query(self.pacer_case_id)
        self.assertIn('Foley v. Bates', self.report.response.text,
                      msg="Super basic query failed")

        # Filter by start date only.
        self.report.query(self.pacer_case_id, date_start=date(2007, 11, 1))
        row_count = self._count_rows(self.report.response.text)
        self.assertEqual(2, row_count, msg="Didn't get expected number of "
                                           "rows when filtering by start "
                                           "date. Got %s." % row_count)

        # Filter by start AND end dates.
        self.report.query(self.pacer_case_id,
                          date_start=date(2007, 11, 1),
                          date_end=date(2007, 11, 28))
        row_count = self._count_rows(self.report.response.text)
        self.assertEqual(1, row_count, msg="Didn't get expected number of "
                                           "rows when filtering by start and "
                                           "end dates. Got %s." % row_count)

        # Filter by document number range.
        self.report.query(self.pacer_case_id, doc_num_start=5,
                          doc_num_end=5)
        row_count = self._count_rows(self.report.response.text)
        self.assertEqual(1, row_count,
                         msg="Didn't get expected number of rows "
                             "when filtering by doc number. Got "
                             "%s" % row_count)

        # Filter by date range using the "Entered" date rather than "Filed".
        self.report.query(self.pacer_case_id,
                          date_start=date(2007, 11, 1),
                          date_end=date(2007, 11, 28),
                          date_range_type="Entered")
        row_count = self._count_rows(self.report.response.text)
        self.assertEqual(1, row_count,
                         msg="Didn't get expected number of rows "
                             "when filtering by start and end "
                             "dates and date_range_type of "
                             "Entered. Got %s" % row_count)

        # Explicitly request party and counsel information.
        self.report.query(self.pacer_case_id, doc_num_start=500,
                          show_parties_and_counsel=True)
        self.assertIn('Cheema', self.report.response.text,
                      msg="Didn't find party info when it was explicitly "
                          "requested.")
        self.report.query(self.pacer_case_id, doc_num_start=500,
                          show_parties_and_counsel=False)
        self.assertNotIn('Cheema', self.report.response.text,
                         msg="Got party info but it was not requested.")

    @SKIP_IF_NO_PACER_LOGIN
    def test_using_same_report_twice(self):
        """Do the caches get properly nuked between runs?

        See issue #187.
        """
        # Query the first one...
        self.report.query(self.pacer_case_id)
        d = self.report.data.copy()

        # Then the second one...
        second_pacer_case_id = '63111'  # 1:07-cv-00035-RJA-HKS Anson v. USA
        self.report.query(second_pacer_case_id)
        d2 = self.report.data.copy()
        self.assertNotEqual(
            d, d2,
            msg="Got same values for docket data of two different queries. "
                "Is there a problem with the caches on the DocketReport?"
        )
#!/usr/bin/env python
#
# Takes an .html file on the command line, parses it using the PACER
# Docket Report parser, and outputs json to stdout.
#
# Fixed: the original used a Python 2-only print statement and relied on
# str.decode() of a text-mode read, both of which fail under Python 3.
# Reading in binary mode and decoding explicitly works on both versions.

import sys

import jsondate as json

from juriscraper.pacer import DocketReport
from juriscraper.pacer.http import PacerSession

# The session is only needed to construct the report; no login is performed
# because we parse local files rather than querying PACER.
pacer_session = PacerSession(username='******', password='******')
report = DocketReport('psc', pacer_session)

for path in sys.argv[1:]:
    with open(path, 'rb') as f:
        report._parse_text(f.read().decode('utf-8'))
    data = report.data
    print(json.dumps(data, indent=2, sort_keys=True,
                     separators=(',', ': ')))
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or recap
            // document was created (implying a Solr needs updating).
            'needs_solr_update': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')

    if 'History/Documents' in text:
        # Prior to 1.1.8, we did not separate docket history reports into their
        # own upload_type. Alas, we still have some old clients around, so we
        # need to handle those clients here.
        pq.upload_type = DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        # Abort the rest of the Celery chain for this task.
        self.request.callbacks = None
        return None

    report._parse_text(text)
    data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, pq.INVALID_CONTENT)
        self.request.callbacks = None
        return None

    # Merge the contents of the docket into CL.
    d, count = find_docket_object(pq.court_id, pq.pacer_case_id,
                                  data['docket_number'])
    if count > 1:
        logger.info("Found %s dockets during lookup. Choosing oldest." %
                    count)
        d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, data)

    if pq.debug:
        # Debug mode: mark success but skip all DB writes below.
        mark_pq_successful(pq, d_id=d.pk)
        self.request.callbacks = None
        return {'docket_pk': d.pk, 'needs_solr_update': False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d, upload_type=DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    rds_created, needs_solr_update = add_docket_entries(d,
                                                        data['docket_entries'])
    add_parties_and_attorneys(d, data['parties'])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        'docket_pk': d.pk,
        'needs_solr_update': bool(rds_created or needs_solr_update),
    }
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to
    work on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or
            // recap document was created (implying a Solr needs
            // updating).
            'content_updated': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    start_time = now()
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))

    try:
        text = pq.filepath_local.read().decode("utf-8")
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        if (self.request.retries == self.max_retries) or pq.debug:
            # Out of retries (or debugging): give up on this item.
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    if "History/Documents" in text:
        # Prior to 1.1.8, we did not separate docket history reports into their
        # own upload_type. Alas, we still have some old clients around, so we
        # need to handle those clients here.
        pq.upload_type = UPLOAD_TYPE.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        # Abort the remainder of the Celery chain for this task.
        self.request.chain = None
        return None

    report._parse_text(text)
    data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    d, docket_count = find_docket_object(
        pq.court_id, pq.pacer_case_id, data["docket_number"]
    )
    if docket_count > 1:
        logger.info(
            "Found %s dockets during lookup. Choosing oldest." % docket_count
        )
        d = d.earliest("date_created")

    d.add_recap_source()
    update_docket_metadata(d, data)
    if not d.pacer_case_id:
        # Backfill the PACER case id when the matched docket lacks one.
        d.pacer_case_id = pq.pacer_case_id

    if pq.debug:
        # Debug mode: mark success but skip all DB writes below.
        mark_pq_successful(pq, d_id=d.pk)
        self.request.chain = None
        return {"docket_pk": d.pk, "content_updated": False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(
        content_object=d, upload_type=UPLOAD_TYPE.DOCKET
    )
    pacer_file.filepath.save(
        "docket.html",  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    rds_created, content_updated = add_docket_entries(
        d, data["docket_entries"]
    )
    add_parties_and_attorneys(d, data["parties"])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    if content_updated and docket_count > 0:
        # Only alert on updates to pre-existing dockets, and only if the
        # alert was not already queued.
        newly_enqueued = enqueue_docket_alert(d.pk)
        if newly_enqueued:
            send_docket_alert(d.pk, start_time)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        "docket_pk": d.pk,
        "content_updated": bool(rds_created or content_updated),
    }
def process_recap_docket(pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item to work on.
    :return: The Docket that was created or updated, or None when the
        docket lookup is ambiguous.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item: %s" % pq)

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    # NOTE(review): sibling versions call report._parse_text(); confirm
    # which API this juriscraper version exposes.
    report.parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    # Merge the contents of the docket into CL
    try:
        d = Docket.objects.get(
            Q(pacer_case_id=pq.pacer_case_id) |
            Q(docket_number=docket_data['docket_number']),
            court_id=pq.court_id,
        )
        # Add RECAP as a source if it's not already.
        if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
            d.source = Docket.RECAP_AND_SCRAPER
        elif d.source == Docket.COLUMBIA:
            d.source = Docket.COLUMBIA_AND_RECAP
        elif d.source == Docket.COLUMBIA_AND_SCRAPER:
            d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    except Docket.DoesNotExist:
        d = Docket(source=Docket.RECAP,
                   pacer_case_id=pq.pacer_case_id,
                   court_id=pq.court_id)
    except Docket.MultipleObjectsReturned:
        msg = "Too many dockets found when trying to look up '%s'" % pq
        logger.error(msg)
        pq.error_message = msg
        pq.status = pq.PROCESSING_FAILED
        pq.save()
        return None

    update_docket_metadata(d, docket_data)
    d.save()

    # Docket entries
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                })
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq))
            continue

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            RECAPDocument.objects.create(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
                pacer_doc_id=docket_entry['pacer_doc_id'],
                is_available=False,
            )
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq))
            continue
        else:
            # BUG FIX: docket_entry is a dict, so attribute access
            # (docket_entry.pacer_doc_id) raised AttributeError. Index it
            # like every other use of docket_entry in this loop.
            rd.pacer_doc_id = rd.pacer_doc_id or docket_entry['pacer_doc_id']

    # Parties
    for party in docket_data['parties']:
        try:
            p = Party.objects.get(name=party['name'])
        except Party.DoesNotExist:
            p = Party.objects.create(
                name=party['name'],
                extra_info=party['extra_info'],
            )
        except Party.MultipleObjectsReturned:
            # Ambiguous party name; skip it rather than guess.
            continue
        else:
            if party['extra_info']:
                p.extra_info = party['extra_info']
                p.save()

        # If the party type doesn't exist, make a new one.
        if not p.party_types.filter(docket=d, name=party['type']).exists():
            PartyType.objects.create(docket=d, party=p, name=party['type'])

        # Attorneys
        for atty in party.get('attorneys', []):
            add_attorney(atty, p, d)

    pq.error_message = ''  # Clear out errors b/c successful
    pq.status = pq.PROCESSING_SUCCESSFUL
    pq.save()

    return d