def get_cover_sheets_for_docket(options, docket_pks, tag=None):
    """Get civil cover sheets for dockets in our system."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    cover_sheet_re = re.compile(r'cover\s*sheet', re.IGNORECASE)
    for i, docket_pk in enumerate(docket_pks):
        throttle.maybe_wait()
        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        try:
            # get() returns a model instance, which has no values_list()
            # method; pull the pk through the queryset instead.
            rd_pk = RECAPDocument.objects.values_list('pk', flat=True).get(
                document_number=1,
                docket_entry__docket_id=docket_pk,
            )
        except (RECAPDocument.MultipleObjectsReturned,
                RECAPDocument.DoesNotExist):
            logger.warning("Unable to get document 1 for docket_pk: %s" %
                           docket_pk)
        else:
            get_pacer_doc_by_rd_and_description.apply_async(
                args=(rd_pk, cover_sheet_re, pacer_session),
                kwargs={'tag': tag},
                queue=q,
            )
def get_pacer_dockets(options, row_pks, tag=None):
    """Get the pacer dockets identified by the FJC IDB rows"""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    for i, row_pk in enumerate(row_pks):
        throttle.maybe_wait()
        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        row = FjcIntegratedDatabase.objects.get(pk=row_pk)
        get_docket_by_pacer_case_id.apply_async(
            args=(
                row.pacer_case_id,
                map_cl_to_pacer_id(row.district_id),
                pacer_session,
            ),
            kwargs={
                'tag': tag,
                'show_parties_and_counsel': True,
                'show_terminated_parties': True,
                'show_list_of_member_cases': True,
            },
            queue=q,
        )
def get_pacer_dockets(options, docket_pks, tags):
    """Get the pacer dockets identified by the FJC IDB rows"""
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = None
    for i, docket_pk in enumerate(docket_pks):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()
        if i % 1000 == 0 or pacer_session is None:
            pacer_session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            pacer_session.login()
            logger.info(f"Sent {i} tasks to celery so far.")
        d = Docket.objects.get(pk=docket_pk)
        chain(
            get_docket_by_pacer_case_id.s(
                {"pacer_case_id": d.pacer_case_id, "docket_pk": d.pk},
                d.court_id,
                cookies=pacer_session.cookies,
                tag_names=tags,
                **{
                    "show_parties_and_counsel": True,
                    "show_terminated_parties": True,
                    "show_list_of_member_cases": False,
                },
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
def test_logging_short_username(self):
    """If a username shorter than six characters is provided, do we throw
    an appropriate exception?
    """
    session = PacerSession(username="******", password="******")
    with self.assertRaises(PacerLoginException):
        session.login()
def get_pacer_dockets(options, docket_pks, tag):
    """Get the pacer dockets identified by the FJC IDB rows"""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = None
    for i, docket_pk in enumerate(docket_pks):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()
        if i % 1000 == 0 or pacer_session is None:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        d = Docket.objects.get(pk=docket_pk)
        chain(
            get_docket_by_pacer_case_id.s(
                {'pacer_case_id': d.pacer_case_id},
                d.court_id,
                cookies=pacer_session.cookies,
                **{
                    'tag_names': [tag],
                    'show_parties_and_counsel': True,
                    'show_terminated_parties': True,
                    'show_list_of_member_cases': True,
                },
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
def test_logging_short_password(self):
    """If a short password is provided, do we throw an appropriate
    exception?
    """
    session = PacerSession(username="******", password="******")
    with self.assertRaises(PacerLoginException):
        session.login()
def get_pacer_doc_ids(options):
    """Get pacer_doc_ids for any item that needs them."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    row_pks = RECAPDocument.objects.filter(
        pacer_doc_id=None,
    ).exclude(
        document_number=None,
    ).exclude(
        docket_entry__docket__pacer_case_id=None,
    ).exclude(
        docket_entry__docket__court__jurisdiction__in=Court.BANKRUPTCY_JURISDICTIONS,
    ).order_by('pk').values_list('pk', flat=True)
    completed = 0
    for row_pk in row_pks:
        if completed >= options['count'] > 0:
            break
        if row_pk < options['start_pk'] > 0:
            continue
        throttle.maybe_wait()
        if completed % 1000 == 0:
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()
            logger.info("Sent %s tasks to celery so far. Latest pk: %s" %
                        (completed, row_pk))
        get_pacer_doc_id_with_show_case_doc_url.apply_async(
            args=(row_pk, session),
            queue=q,
        )
        completed += 1
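# The guards above lean on Python's chained comparisons:
# `completed >= options['count'] > 0` evaluates as
# `completed >= options['count'] and options['count'] > 0`, so a count of 0
# (or any non-positive value) disables the limit entirely. A minimal sketch
# of the same pattern, with a hypothetical helper, for illustration:
def _limit_reached(completed, count):
    # True only when a positive limit is configured and has been reached.
    return completed >= count > 0

assert _limit_reached(5, 5) is True   # limit of 5 reached
assert _limit_reached(5, 0) is False  # count=0 means "no limit"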
def get_pacer_dockets(options, row_pks, tag=None):
    """Get the pacer dockets identified by the FJC IDB rows"""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    for i, row_pk in enumerate(row_pks):
        if i >= options['count'] > 0:
            break
        throttle.maybe_wait()
        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        row = FjcIntegratedDatabase.objects.get(pk=row_pk)
        chain(
            get_docket_by_pacer_case_id.s(
                row.pacer_case_id,
                row.district_id,
                pacer_session,
                **{
                    'tag': tag,
                    'show_parties_and_counsel': True,
                    'show_terminated_parties': True,
                    'show_list_of_member_cases': True,
                },
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
def get_doc_by_re_and_de_nums_for_dockets(options, docket_pks, regex,
                                          de_nums, fallback=False, tag=None):
    r"""Get documents matching a description regex for dockets in our system.

    :param options: The options sent on the command line as a dict.
    :param docket_pks: A list of docket pks to iterate over.
    :param regex: A regex to match on the document description on the
    attachment page. For example, to get initial complaints, set this to
    r'initial\s*complaints'.
    :param de_nums: The docket entry numbers to use when looking for items,
    as a list.
    :param fallback: After loading the attachment page, if we don't find
    something that matches `regex`, should we just grab the main document?
    :param tag: A tag to add to any modified content.
    """
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    for i, docket_pk in enumerate(docket_pks):
        if i >= options['count'] > 0:
            break
        throttle.maybe_wait()
        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        # filter() never raises DoesNotExist or MultipleObjectsReturned, so
        # check for an empty queryset instead of catching those exceptions.
        rds = RECAPDocument.objects.filter(
            document_number__in=de_nums,
            document_type=RECAPDocument.PACER_DOCUMENT,
            docket_entry__docket_id=docket_pk,
        )
        if not rds.exists():
            logger.warning("Unable to get documents %s for docket_pk: %s" %
                           (de_nums, docket_pk))
            continue
        for rd in rds:
            get_pacer_doc_by_rd_and_description.apply_async(
                args=(rd.pk, regex, pacer_session),
                kwargs={
                    'fallback_to_main_doc': fallback,
                    'tag': tag,
                },
                queue=q,
            )
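# A minimal usage sketch following the docstring's own example of fetching
# initial complaints. The option values, docket pks, and tag name are all
# hypothetical; a compiled pattern is passed, matching the cover-sheet
# caller above.
initial_complaint_re = re.compile(r'initial\s*complaints', re.IGNORECASE)
get_doc_by_re_and_de_nums_for_dockets(
    options={'queue': 'pacer', 'count': 0},  # count=0: no limit
    docket_pks=[1, 2, 3],                    # hypothetical docket pks
    regex=initial_complaint_re,
    de_nums=[1],                             # look at docket entry 1
    fallback=True,                           # grab main doc if no match
    tag='initial-complaints',                # hypothetical tag
)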
def test_logging_into_pacer(self):
    try:
        session = PacerSession(username=PACER_USERNAME,
                               password=PACER_PASSWORD)
        session.login()
        self.assertIsNotNone(session)
        self.assertIsNotNone(session.cookies.get(
            'PacerSession', None, domain='.uscourts.gov', path='/'))
    except PacerLoginException:
        self.fail('Could not log into PACER')
def get_pacer_case_ids(options, row_pks): """Get the PACER case IDs for the given items.""" q = options['queue'] throttle = CeleryThrottle(queue_name=q) for i, row_pk in enumerate(row_pks): throttle.maybe_wait() if i % 10000 == 0: pacer_session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) pacer_session.login() logger.info("Sent %s tasks to celery so far." % i) get_pacer_case_id_for_idb_row.apply_async(args=(row_pk, pacer_session), queue=q)
def get_pacer_case_ids(options, row_pks): """Get the PACER case IDs for an item in the IDB by looking it up on PACER""" q = options['queue'] throttle = CeleryThrottle(queue_name=q) for i, row_pk in enumerate(row_pks): if i >= options['count'] > 0: break throttle.maybe_wait() if i % 10000 == 0: pacer_session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) pacer_session.login() logger.info("Sent %s tasks to celery so far." % i) get_pacer_case_id_for_idb_row.apply_async( args=(row_pk, pacer_session.cookies), queue=q, )
def get_pdfs(options):
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each of
    which represents a PDF we need to download and merge into our normal
    tables: Docket, DocketEntry, and RECAPDocument.

    In this function, we iterate over the entire table of results, merge it
    into our normal tables, and then download and extract the PDF.

    :return: None
    """
    q = options["queue"]
    index = options["index"]
    cnt = CaseNameTweaker()
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only("pk")
    count = rows.count()
    task_name = "downloading"
    if index:
        task_name += " and indexing"
    logger.info("%s %s items from PACER." % (task_name, count))
    throttle = CeleryThrottle(queue_name=q)
    completed = 0
    for row in queryset_generator(rows):
        throttle.maybe_wait()
        if completed % 30000 == 0:
            pacer_session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            pacer_session.login()
        c = chain(
            process_free_opinion_result.si(row.pk, cnt).set(queue=q),
            get_and_process_pdf.s(pacer_session.cookies, row.pk).set(queue=q),
            delete_pacer_row.s(row.pk).set(queue=q),
        )
        if index:
            c |= add_items_to_solr.s("search.RECAPDocument").set(queue=q)
        c.apply_async()
        completed += 1
        if completed % 1000 == 0:
            logger.info(
                "Sent %s/%s tasks to celery for %s so far."
                % (completed, count, task_name)
            )
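# The chain above mixes two Celery signature types: `.si()` is immutable
# (it ignores the previous task's return value), while `.s()` is partial
# (the previous result is prepended to its arguments), and `|=` appends one
# more step to an existing chain. A minimal sketch with hypothetical tasks
# and a hypothetical broker URL:
from celery import Celery, chain

app = Celery("sketch", broker="memory://")

@app.task
def fetch(row_pk):
    # Produces a value that downstream partial signatures will receive.
    return {"pk": row_pk}

@app.task
def annotate(result, label):
    # Receives the previous task's return value as its first argument.
    return {"label": label, "previous": result}

c = chain(fetch.si(42), annotate.s("processed"))  # fetch's result feeds annotate
c |= annotate.s("again")  # optionally extend the chain with one more step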
def setUp(self):
    pacer_session = PacerSession()
    if pacer_credentials_are_defined():
        # CAND chosen at random
        pacer_session = get_pacer_session()
        pacer_session.login()
    with open(os.path.join(JURISCRAPER_ROOT, 'pacer/courts.json')) as j:
        self.courts = get_courts_from_json(json.load(j))
    path = os.path.join(TESTS_ROOT_EXAMPLES_PACER,
                        'dates/valid_free_opinion_dates.json')
    with open(path) as j:
        self.valid_dates = json.load(j)
    self.reports = {}
    for court in self.courts:
        court_id = get_court_id_from_url(court['court_link'])
        self.reports[court_id] = FreeOpinionReport(court_id, pacer_session)
def setUp(self):
    pacer_session = PacerSession()
    if PACER_USERNAME and PACER_PASSWORD:
        # CAND chosen at random
        pacer_session = PacerSession(username=PACER_USERNAME,
                                     password=PACER_PASSWORD)
        pacer_session.login()
    with open(os.path.join(JURISCRAPER_ROOT, 'pacer/courts.json')) as j:
        self.courts = get_courts_from_json(json.load(j))
    path = os.path.join(TESTS_ROOT, 'fixtures/valid_free_opinion_dates.json')
    with open(path) as j:
        self.valid_dates = json.load(j)
    self.reports = {}
    for court in self.courts:
        court_id = get_court_id_from_url(court['court_link'])
        self.reports[court_id] = FreeOpinionReport(court_id, pacer_session)
def test_logging_in_bad_credentials(self):
    # Make sure password is more than eight characters.
    session = PacerSession(username="******", password="******")
    with self.assertRaises(PacerLoginException):
        session.login()
def get_and_save_free_document_reports(options): """Query the Free Doc Reports on PACER and get a list of all the free documents. Do not download those items, as that step is done later. """ # Kill any *old* logs that report they're in progress. (They've failed.) twelve_hrs_ago = now() - timedelta(hours=12) PACERFreeDocumentLog.objects.filter( date_started__lt=twelve_hrs_ago, status=PACERFreeDocumentLog.SCRAPE_IN_PROGRESS, ).update(status=PACERFreeDocumentLog.SCRAPE_FAILED, ) cl_court_ids = Court.objects.filter( jurisdiction__in=[Court.FEDERAL_DISTRICT, Court.FEDERAL_BANKRUPTCY], in_use=True, end_date=None, ).exclude(pk__in=[ 'casb', 'ganb', 'gub', 'innb', 'mieb', 'miwb', 'nmib', 'nvb', 'ohsb', 'prb', 'tnwb', 'vib' ], ).values_list( 'pk', flat=True, ) pacer_court_ids = { map_cl_to_pacer_id(v): { 'until': now(), 'count': 1, 'result': None } for v in cl_court_ids } pacer_session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) pacer_session.login() # Iterate over every court, X days at a time. As courts are completed, # remove them from the list of courts to process until none are left tomorrow = now() + timedelta(days=1) while len(pacer_court_ids) > 0: court_ids_copy = pacer_court_ids.copy() # Make a copy of the list. for pacer_court_id, delay in court_ids_copy.items(): if now() < delay['until']: # Do other courts until the delay is up. Do not print/log # anything since at the end there will only be one court left. continue next_start_date, next_end_date = get_next_date_range( pacer_court_id) if delay['result'] is not None: if delay['result'].ready(): result = delay['result'].get() if result == PACERFreeDocumentLog.SCRAPE_SUCCESSFUL: if next_start_date >= tomorrow.date(): logger.info("Finished '%s'. Marking it complete." % pacer_court_id) pacer_court_ids.pop(pacer_court_id, None) continue elif result == PACERFreeDocumentLog.SCRAPE_FAILED: logger.error("Encountered critical error on %s " "(network error?). Marking as failed and " "pressing on." % pacer_court_id) pacer_court_ids.pop(pacer_court_id, None) continue else: next_delay = min(delay['count'] * 5, 30) # backoff w/cap logger.info( "Court %s still in progress. Delaying at least " "%ss." % (pacer_court_id, next_delay)) pacer_court_ids[pacer_court_id]['until'] = now( ) + timedelta(seconds=next_delay) pacer_court_ids[pacer_court_id]['count'] += 1 continue mark_court_in_progress(pacer_court_id, next_end_date) pacer_court_ids[pacer_court_id]['count'] = 1 # Reset delay['result'] = chain( get_and_save_free_document_report.si(pacer_court_id, next_start_date, next_end_date, pacer_session), mark_court_done_on_date.s(pacer_court_id, next_end_date), ).apply_async()
def test_logging_in_bad_credentials(self):
    session = PacerSession(username='******', password='******')
    with self.assertRaises(PacerLoginException):
        session.login()
def get_and_save_free_document_reports(options):
    """Query the Free Doc Reports on PACER and get a list of all the free
    documents. Do not download those items, as that step is done later. For
    now just get the list.

    Note that this uses synchronous celery chains. A previous version was
    more complex and did not use synchronous chains. Unfortunately in Celery
    4.2.0, or more accurately in redis-py 3.x.x, doing it that way failed
    nearly every time. This is a simpler version, though a slower one, but
    it should get the job done.
    """
    # Kill any *old* logs that report they're in progress. (They've failed.)
    three_hrs_ago = now() - timedelta(hours=3)
    PACERFreeDocumentLog.objects.filter(
        date_started__lt=three_hrs_ago,
        status=PACERFreeDocumentLog.SCRAPE_IN_PROGRESS,
    ).update(status=PACERFreeDocumentLog.SCRAPE_FAILED)

    cl_court_ids = Court.objects.filter(
        jurisdiction__in=[Court.FEDERAL_DISTRICT, Court.FEDERAL_BANKRUPTCY],
        in_use=True,
        end_date=None,
    ).exclude(
        pk__in=['casb', 'gub', 'innb', 'miwb', 'ohsb', 'prb'],
    ).values_list('pk', flat=True)
    pacer_court_ids = [map_cl_to_pacer_id(v) for v in cl_court_ids]
    pacer_session = PacerSession(username=PACER_USERNAME,
                                 password=PACER_PASSWORD)
    pacer_session.login()
    today = now()
    for pacer_court_id in pacer_court_ids:
        while True:
            next_start_d, next_end_d = get_next_date_range(pacer_court_id)
            logger.info("Attempting to get latest document references for "
                        "%s between %s and %s", pacer_court_id, next_start_d,
                        next_end_d)
            mark_court_in_progress(pacer_court_id, next_end_d)
            try:
                status = get_and_save_free_document_report(
                    pacer_court_id, next_start_d, next_end_d,
                    pacer_session.cookies)
            except RequestException:
                logger.error("Failed to get document references for %s "
                             "between %s and %s due to network error.",
                             pacer_court_id, next_start_d, next_end_d)
                mark_court_done_on_date(PACERFreeDocumentLog.SCRAPE_FAILED,
                                        pacer_court_id, next_end_d)
                break
            except IndexError:
                logger.error("Failed to get document references for %s "
                             "between %s and %s due to PACER 6.3 bug.",
                             pacer_court_id, next_start_d, next_end_d)
                mark_court_done_on_date(PACERFreeDocumentLog.SCRAPE_FAILED,
                                        pacer_court_id, next_end_d)
                break
            else:
                result = mark_court_done_on_date(status, pacer_court_id,
                                                 next_end_d)
                if result == PACERFreeDocumentLog.SCRAPE_SUCCESSFUL:
                    if next_end_d >= today.date():
                        logger.info("Got all document references for '%s'.",
                                    pacer_court_id)
                        # Break from while loop; onwards to next court.
                        break
                    else:
                        # More dates to do; let the loop continue.
                        continue
                elif result == PACERFreeDocumentLog.SCRAPE_FAILED:
                    logger.error("Encountered critical error on %s "
                                 "(network error?). Marking as failed and "
                                 "pressing on.", pacer_court_id)
                    # Break from while loop; onwards to next court.
                    break
def setUp(self):
    pacer_session = PacerSession(username=PACER_USERNAME,
                                 password=PACER_PASSWORD)
    pacer_session.login()
    self.report = DocketReport('cand', pacer_session)
    self.pacer_case_id = '186730'  # 4:06-cv-07294 Foley v. Bates