def get_attachment_pages(options, tag):
    rd_pks = RECAPDocument.objects.filter(
        tags__name=tag,
        docket_entry__description__icontains='attachment',
    ).values_list('pk', flat=True)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    get_district_attachment_pages(
        options=options, rd_pks=rd_pks, tag_names=[tag], session=session
    )


def get_att_pages(options):
    rd_pks = RECAPDocument.objects.filter(
        tags__name=TAG,
        docket_entry__docket__court__jurisdiction__in=[
            Court.FEDERAL_DISTRICT,
            Court.FEDERAL_BANKRUPTCY,
        ],
    ).values_list('pk', flat=True)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    get_district_attachment_pages(
        options=options, rd_pks=rd_pks, tag_names=[TAG], session=session
    )


def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q, min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            i += 1
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result['id'])
        logger.info("Doing item %s w/rd: %s, d: %s",
                    i, rd.pk, result['docket_id'])

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            i += 1
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            i += 1
            continue

        chain(
            get_pacer_doc_by_rd.s(
                rd.pk, session.cookies, tag=TAG_PHASE_2).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_or_update_recap_document.si([rd.pk]).set(queue=q),
        ).apply_async()
        i += 1


def download_dockets(options):
    """Download dockets listed in the spreadsheet."""
    with open(options['input_file'], 'r') as f:
        dialect = csv.Sniffer().sniff(f.read(1024))
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect)
        q = options['queue']
        task = options['task']
        throttle = CeleryThrottle(queue_name=q,
                                  min_items=options['queue_length'])
        session = PacerSession(username=PACER_USERNAME,
                               password=PACER_PASSWORD)
        session.login()
        for i, row in enumerate(reader):
            if i < options['offset']:
                continue
            if i >= options['limit'] > 0:
                break
            throttle.maybe_wait()

            logger.info("Doing row %s: %s", i, row)

            if row['idb_docket_number']:
                if task == 'download_student_dockets':
                    continue
                # Zero-pad the docket number up to seven digits because
                # Excel ate the leading zeros that these would normally
                # have.
                docket_number = row['idb_docket_number'].rjust(7, '0')
            elif row['student_docket_number']:
                # Use the values collected by student researchers, then
                # cleaned up by mlr.
                docket_number = row['student_docket_number']
            else:
                # No docket number; move on.
                continue
            court = Court.objects.get(
                fjc_court_id=row['AO ID'].rjust(2, '0'),
                jurisdiction=Court.FEDERAL_DISTRICT,
            )
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=docket_number,
                    court_id=court.pk,
                    cookies=session.cookies,
                    case_name=row['Case Name'],
                ).set(queue=q),
                get_docket_by_pacer_case_id.s(
                    court_id=court.pk,
                    cookies=session.cookies,
                    tag_names=[TAG_NAME],
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()


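# A minimal, hypothetical sketch of the spreadsheet shape and options dict
# that download_dockets() above expects. The column names come straight from
# the row[...] lookups in the loop; the file path, queue name, and values are
# invented for illustration.
def example_download_dockets_invocation():
    # dockets.csv (hypothetical):
    #   idb_docket_number,student_docket_number,AO ID,Case Name
    #   0001234,,02,Smith v. Jones
    #   ,1:09-cv-00042,36,Doe v. Acme Corp.
    options = {
        'input_file': 'dockets.csv',   # hypothetical path
        'queue': 'pacer',              # hypothetical Celery queue
        'queue_length': 100,
        'task': 'download_dockets',
        'offset': 0,
        'limit': 0,  # 0 disables the cap via the `i >= limit > 0` guard
    }
    download_dockets(options)

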
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {"rows": page_size, "fl": ["id", "docket_id"]},
        {"group": False, "facet": False, "highlight": False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query).execute()
    si.conn.http_connection.close()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options["offset"]:
            i += 1
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result["id"])
        logger.info(
            "Doing item %s w/rd: %s, d: %s", i, rd.pk, result["docket_id"]
        )

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            i += 1
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            i += 1
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies, tag=TAG_PHASE_2).set(
                queue=q
            ),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q),
        ).apply_async()
        i += 1


def get_dockets(options):
    """Download a sample of dockets from PACER matching the 7xx series of
    NOS codes.
    """
    nos_codes = [
        LABOR_LITIGATION_OTHER,
        LABOR_MANAGEMENT_RELATIONS_ACT,
        LABOR_MANAGEMENT_REPORT_DISCLOSURE,
        FAIR_LABOR_STANDARDS_ACT_CV,
        RAILWAY_LABOR_ACT,
        FAMILY_AND_MEDICAL_LEAVE_ACT,
        EMPLOYEE_RETIREMENT_INCOME_SECURITY_ACT,
    ]
    sample_size = 300
    items = FjcIntegratedDatabase.objects.filter(
        nature_of_suit__in=nos_codes,
        date_terminated__gt='2009-01-01',
        date_terminated__lt='2018-10-15',
        date_filed__gt='2009-01-01',
    ).order_by('?')[:sample_size]
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(items):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", i, row)
        logger.info("This case is from year: %s", row.date_filed.year)
        throttle.maybe_wait()
        case_name = '%s v. %s' % (row.plaintiff, row.defendant)
        chain(
            get_pacer_case_id_and_title.s(
                docket_number=row.docket_number,
                court_id=row.district_id,
                cookies=session.cookies,
                case_name=case_name,
            ).set(queue=q),
            get_docket_by_pacer_case_id.s(
                court_id=row.district_id,
                cookies=session.cookies,
                tag_names=[TAG],
                **{
                    'show_parties_and_counsel': True,
                    'show_terminated_parties': True,
                    'show_list_of_member_cases': True,
                }
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()


def get_dockets(options, items, tags, sample_size=0, doc_num_end=""):
    """Download dockets from PACER.

    :param options: Options provided by argparse
    :param items: Items from our FJC IDB database
    :param tags: A list of tag names to associate with the purchased
    content.
    :param sample_size: The number of items to get. If 0, get them all.
    Else, get only this many and do it randomly.
    :param doc_num_end: Only get docket numbers up to this value to
    constrain costs. If set to an empty string, no constraints are applied.
    Note that applying this value means no unnumbered entries will be
    retrieved by PACER.
    """
    if sample_size > 0:
        items = items.order_by("?")[:sample_size]
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(items):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        if i % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            session.login()

        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", i, row)
        throttle.maybe_wait()
        params = make_fjc_idb_lookup_params(row)
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=None,
                docket_number=row.docket_number,
                court_id=row.district_id,
                cookies=session.cookies,
                **params,
            ).set(queue=q),
            filter_docket_by_tags.s(tags, row.district_id).set(queue=q),
            get_docket_by_pacer_case_id.s(
                court_id=row.district_id,
                cookies=session.cookies,
                tag_names=tags,
                **{
                    "show_parties_and_counsel": True,
                    "show_terminated_parties": True,
                    "show_list_of_member_cases": False,
                    "doc_num_end": doc_num_end,
                },
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()


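# Hypothetical usage of get_dockets() above: purchase a random sample of 50
# IDB-matched dockets while capping each purchase at docket entry 50 to keep
# costs down. The filter value and tag name are illustrative, not taken from
# the original scripts.
def example_get_dockets_sample(options):
    items = FjcIntegratedDatabase.objects.filter(
        date_filed__gt="2015-01-01",  # hypothetical cutoff
    )
    get_dockets(
        options,
        items,
        tags=["example-project-tag"],  # hypothetical tag
        sample_size=50,
        doc_num_end="50",  # skips entries past 50, and all unnumbered ones
    )

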
def add_all_nysd_to_cl(options):
    """Alas, there's only one way to get all the cases about a particular
    judge: Get all the cases in the entire jurisdiction. We do that here
    using the iquery.pl endpoint. Once added to the DB we'll ensure they're
    tagged. In the next step, we'll download all the tagged items.
    """
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    # IDs obtained by binary search of docket numbers on PACER website.
    earliest_id = 405990
    latest_id = 543051
    for pacer_case_id in range(earliest_id, latest_id):
        if pacer_case_id < options["skip_until"]:
            continue
        if pacer_case_id >= options["limit"] > 0:
            break
        if pacer_case_id % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            session.login()
        throttle.maybe_wait()
        logger.info("Doing pacer_case_id: %s", pacer_case_id)
        make_docket_by_iquery.apply_async(
            args=("nysd", pacer_case_id, session.cookies, [NYSD_TAG]),
            queue=q,
        )


def get_final_docs(options):
    """Get any documents that contain "final" in their description."""
    des = (
        DocketEntry.objects.filter(
            tags__name=TAG, description__icontains="final"
        )
        .order_by("pk")
        .iterator()
    )
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(
        username=PACER_USERNAME, password=PACER_PASSWORD
    )
    pacer_session.login()
    for i, de in enumerate(des):
        if i < options["offset"]:
            i += 1
            continue
        if i >= options["limit"] > 0:
            break
        if i % 1000 == 0:
            pacer_session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            pacer_session.login()
            logger.info(f"Sent {i} tasks to celery so far.")
        logger.info("Doing row %s", i)
        rd_pks = (
            de.recap_documents.filter(
                document_type=RECAPDocument.PACER_DOCUMENT,
            )
            .exclude(pacer_doc_id="")
            .order_by("pk")
            .values_list("pk", flat=True)
        )
        for rd_pk in rd_pks:
            throttle.maybe_wait()
            chain(
                get_pacer_doc_by_rd.s(
                    rd_pk, pacer_session.cookies, tag=TAG_FINALS
                ).set(queue=q),
                extract_recap_pdf.si(rd_pk).set(queue=q),
                add_items_to_solr.si([rd_pk], "search.RECAPDocument").set(
                    queue=q
                ),
            ).apply_async()


def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q, min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            i += 1
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result['id'])
        logger.info("Doing item %s w/rd: %s, d: %s",
                    i, rd.pk, result['docket_id'])

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            i += 1
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            i += 1
            continue

        chain(
            get_pacer_doc_by_rd.s(
                rd.pk, session.cookies, tag=TAG_PHASE_2).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
        i += 1


def get_attachment_pages(options):
    """Find docket entries that look like invoices and get their attachment
    pages.
    """
    page_size = 100
    main_query = build_main_query_from_query_string(
        Q_DOCS_ONLY,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query)

    q = options['queue']
    recap_user = User.objects.get(username='******')
    throttle = CeleryThrottle(queue_name=q, min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    paginator = Paginator(results, page_size)
    i = 0
    for page_number in range(1, paginator.num_pages + 1):
        paged_results = paginator.page(page_number)
        for result in paged_results.object_list:
            if i < options['offset']:
                i += 1
                continue
            if i >= options['limit'] > 0:
                break
            logger.info("Doing row %s: rd: %s, docket: %s",
                        i, result['id'], result['docket_id'])
            throttle.maybe_wait()
            chain(
                # Query the attachment page and process it
                get_attachment_page_by_rd.s(
                    result['id'], session.cookies).set(queue=q),
                # Take that in a new task and make a PQ object
                make_attachment_pq_object.s(
                    result['id'], recap_user.pk).set(queue=q),
                # And then process that using the normal machinery.
                process_recap_attachment.s(
                    tag_names=[TAG_PHASE_1]).set(queue=q),
            ).apply_async()
            i += 1
        else:
            # Inner loop exited normally (didn't "break"); do the next page.
            continue
        # Inner loop broke. Break the outer loop too.
        break


def get_attachment_page_by_rd(self, rd_pk, cookies):
    """Get the attachment page for the item in PACER.

    :param rd_pk: The PK of a RECAPDocument object to use as a source.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies
    of a logged-on PACER user.
    :return: The attachment report populated with the results
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    if not rd.pacer_doc_id:
        # Some docket entries are just text/don't have a pacer_doc_id.
        self.request.callbacks = None
        return
    s = PacerSession(cookies=cookies)
    pacer_court_id = map_cl_to_pacer_id(rd.docket_entry.docket.court_id)
    att_report = AttachmentPage(pacer_court_id, s)
    try:
        att_report.query(rd.pacer_doc_id)
    except HTTPError as exc:
        if exc.response.status_code in [
            HTTP_500_INTERNAL_SERVER_ERROR,
            HTTP_504_GATEWAY_TIMEOUT,
        ]:
            logger.warning("Ran into HTTPError: %s. Retrying.",
                           exc.response.status_code)
            raise self.retry(exc=exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting."
            logger.error(msg, exc.response.status_code)
            self.request.callbacks = None
            return
    except requests.RequestException as exc:
        logger.warning("Unable to get attachment page for %s", rd)
        raise self.retry(exc=exc)
    return att_report


def update_docket_info_iquery(self, d_pk):
    cookies = get_or_cache_pacer_cookies(
        "pacer_scraper",
        settings.PACER_USERNAME,
        password=settings.PACER_PASSWORD,
    )
    s = PacerSession(
        cookies=cookies,
        username=settings.PACER_USERNAME,
        password=settings.PACER_PASSWORD,
    )
    d = Docket.objects.get(pk=d_pk)
    report = CaseQuery(map_cl_to_pacer_id(d.court_id), s)
    try:
        report.query(d.pacer_case_id)
    except (requests.Timeout, requests.RequestException) as exc:
        logger.warning(
            "Timeout or unknown RequestException on iquery crawl. "
            "Trying again if retries not exceeded."
        )
        if self.request.retries == self.max_retries:
            return
        raise self.retry(exc=exc)
    d = update_docket_metadata(d, report.data)
    d.save()
    add_bankruptcy_data_to_docket(d, report.data)
    add_items_to_solr([d.pk], "search.Docket")


def get_dockets(options, items, tags, sample_size=0):
    """Download dockets from PACER.

    :param options: Options provided by argparse
    :param items: Items from our FJC IDB database
    :param tags: A list of tag names to associate with the purchased
    content.
    :param sample_size: The number of items to get. If 0, get them all.
    Else, get only this many and do it randomly.
    """
    if sample_size > 0:
        items = items.order_by('?')[:sample_size]
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(items):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        if i % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()

        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", i, row)
        throttle.maybe_wait()
        params = make_fjc_idb_lookup_params(row)
        chain(
            get_pacer_case_id_and_title.s(
                docket_number=row.docket_number,
                court_id=row.district_id,
                cookies=session.cookies,
                **params
            ).set(queue=q),
            filter_docket_by_tags.s(tags, row.district_id).set(queue=q),
            get_docket_by_pacer_case_id.s(
                court_id=row.district_id,
                cookies=session.cookies,
                tag_names=tags,
                **{
                    'show_parties_and_counsel': True,
                    'show_terminated_parties': True,
                    'show_list_of_member_cases': True,
                }
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()


def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)
    recipients = options['recipients'].split(',')
    print("Recipients list is: %s" % recipients)
    s = PacerSession(username=settings.PACER_USERNAME,
                     password=settings.PACER_PASSWORD)
    s.login()
    report = CaseQueryAdvancedBankruptcy('canb', s)
    t1 = now()
    while True:
        # Check each search term; email the recipients and quit on the
        # first one that returns results.
        for query in ('Pacific', 'PG&E'):
            report.query(
                name_last=query,
                filed_from=datetime.date(2019, 1, 28),
                filed_to=datetime.date(2019, 1, 30),
            )
            num_results = len(report.data)
            print("Checked '%s' and got %s results" % (query, num_results))
            if num_results > 0:
                print("Sending emails and exiting!")
                send_emails(report, recipients)
                exit(0)

        time.sleep(options['sleep'])
        t2 = now()
        min_login_frequency = 60 * 30  # thirty minutes
        if (t2 - t1).seconds > min_login_frequency:
            print("Logging in again.")
            s.login()
            t1 = now()


def get_and_save_free_document_report(self, court_id, start, end, cookies):
    """Download the Free document report and save it to the DB.

    :param self: The Celery task.
    :param court_id: A pacer court id.
    :param start: a date object representing the first day to get results.
    :param end: a date object representing the last day to get results.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies
    of a logged-in PACER user.
    :return: A PACERFreeDocumentLog status constant, either
    SCRAPE_SUCCESSFUL or SCRAPE_FAILED.
    """
    s = PacerSession(cookies=cookies, username=settings.PACER_USERNAME,
                     password=settings.PACER_PASSWORD)
    report = FreeOpinionReport(court_id, s)
    try:
        report.query(start, end, sort='case_number')
    except (ConnectionError, ChunkedEncodingError, ReadTimeoutError,
            ReadTimeout, ConnectTimeout) as exc:
        logger.warning("Unable to get free document report results from %s "
                       "(%s to %s). Trying again." % (court_id, start, end))
        if self.request.retries == self.max_retries:
            return PACERFreeDocumentLog.SCRAPE_FAILED
        raise self.retry(exc=exc, countdown=5)
    except SoftTimeLimitExceeded as exc:
        logger.warning("Soft time limit exceeded at %s. %s retries remain.",
                       court_id, (self.max_retries - self.request.retries))
        if self.request.retries == self.max_retries:
            return PACERFreeDocumentLog.SCRAPE_FAILED
        raise self.retry(exc=exc, countdown=5)

    try:
        results = report.data
    except (IndexError, HTTPError) as exc:
        # IndexError: When the page isn't downloaded properly.
        # HTTPError: raise_for_status in parse hit bad status.
        if self.request.retries == self.max_retries:
            return PACERFreeDocumentLog.SCRAPE_FAILED
        raise self.retry(exc=exc, countdown=5)

    for row in results:
        PACERFreeDocumentRow.objects.create(
            court_id=row.court_id,
            pacer_case_id=row.pacer_case_id,
            docket_number=row.docket_number,
            case_name=row.case_name,
            date_filed=row.date_filed,
            pacer_doc_id=row.pacer_doc_id,
            document_number=row.document_number,
            description=row.description,
            nature_of_suit=row.nature_of_suit,
            cause=row.cause,
        )

    return PACERFreeDocumentLog.SCRAPE_SUCCESSFUL


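# A hypothetical sketch of driving get_and_save_free_document_report() above
# for one court over a short window. The court ID and dates are illustrative;
# the cookie jar is assumed to come from a logged-in PacerSession as in the
# surrounding code.
def example_scrape_free_documents(cookies):
    import datetime

    # Queue the scrape for one court over one week (dates are inclusive).
    async_result = get_and_save_free_document_report.apply_async(
        args=(
            "cand",                      # hypothetical PACER court ID
            datetime.date(2019, 1, 1),   # first day to get results
            datetime.date(2019, 1, 7),   # last day to get results
            cookies,
        )
    )
    return async_result

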
def get_pacer_case_id_and_title(self, docket_number, court_id, cookies,
                                case_name=None, office_number=None,
                                docket_number_letters=None):
    """Get the pacer_case_id and title values for a district court docket.

    Use heuristics to disambiguate the results.

    office_number and docket_number_letters are only needed when they are
    not already part of the docket_number passed in. Multiple parameters
    are needed here to allow flexibility when using this API. Some sources,
    like the IDB, have this data all separated out, so it helps not to try
    to recreate docket numbers from data that comes all pulled apart.

    :param docket_number: The docket number to look up. This is a flexible
    field that accepts a variety of docket number styles.
    :param court_id: The CourtListener court ID for the docket number
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies
    of a logged-in PACER user.
    :param case_name: The case name to use for disambiguation
    :param office_number: The number (or letter) where the case took place.
    Typically, this is at the beginning of the docket number, before the
    colon. This will be used for disambiguation. If you passed it as part
    of the docket number, it is not needed here.
    :param docket_number_letters: These are the letters (cv, cr, md, etc.)
    that may appear in a docket number. This is used for disambiguation. If
    you passed these letters in the docket number, you do not need to pass
    them again here.
    :return: The dict formed by the PossibleCaseNumberApi lookup if a good
    value is identified, else None. The dict takes the form of:

        {
            u'docket_number': force_unicode(node.xpath('./@number')[0]),
            u'pacer_case_id': force_unicode(node.xpath('./@id')[0]),
            u'title': force_unicode(node.xpath('./@title')[0]),
        }
    """
    logger.info("Getting pacer_case_id for docket_number %s in court %s",
                docket_number, court_id)
    s = PacerSession(cookies=cookies)
    report = PossibleCaseNumberApi(map_cl_to_pacer_id(court_id), s)
    try:
        report.query(docket_number)
    except requests.RequestException as exc:
        logger.warning("RequestException while running possible case number "
                       "query. Trying again if retries not exceeded: %s.%s",
                       court_id, docket_number)
        if self.request.retries == self.max_retries:
            self.request.callbacks = None
            return None
        raise self.retry(exc=exc)

    try:
        return report.data(case_name=case_name,
                           office_number=office_number,
                           docket_number_letters=docket_number_letters)
    except ParsingException:
        return None


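# Illustrative lookup using get_pacer_case_id_and_title() above, showing how
# office_number and docket_number_letters supply the pieces missing from a
# bare IDB-style docket number. All concrete values here are hypothetical;
# the session is assumed to be a logged-in PacerSession.
def example_disambiguated_lookup(session):
    result = get_pacer_case_id_and_title(
        docket_number="00123",        # bare number, lacking "2:" and "cv"
        court_id="cand",              # hypothetical CL court ID
        cookies=session.cookies,
        case_name="Smith v. Jones",   # helps pick among multiple matches
        office_number="2",            # the piece before the colon
        docket_number_letters="cv",   # the case-type letters
    )
    if result is not None:
        return result["pacer_case_id"], result["title"]

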
def get_pacer_doc_id_with_show_case_doc_url(self, rd_pk, cookies):
    """Use the show_case_doc URL to get pacer_doc_id values.

    :param rd_pk: The pk of the RECAPDocument you want to get.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies
    of a logged-in PACER user.
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    d = rd.docket_entry.docket
    s = PacerSession(cookies=cookies)
    pacer_court_id = map_cl_to_pacer_id(d.court_id)
    report = ShowCaseDocApi(pacer_court_id, s)
    last_try = (self.request.retries == self.max_retries)
    try:
        if rd.document_type == rd.ATTACHMENT:
            report.query(d.pacer_case_id, rd.document_number,
                         rd.attachment_number)
        else:
            report.query(d.pacer_case_id, rd.document_number)
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" % rd)
        if last_try:
            return
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [
            HTTP_500_INTERNAL_SERVER_ERROR,
            HTTP_504_GATEWAY_TIMEOUT,
        ]:
            if last_try:
                logger.error("Ran into repeated HTTPErrors. No more "
                             "retries. Aborting.")
                return
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc=exc)
        else:
            msg = ("Ran into unknown HTTPError. %s. Aborting." %
                   exc.response.status_code)
            logger.error(msg)
            return

    try:
        pacer_doc_id = report.data
    except ParsingException:
        logger.error("Unable to get redirect for %s" % rd)
        return
    else:
        rd.pacer_doc_id = pacer_doc_id
        rd.save()
        logger.info("Successfully saved pacer_doc_id to rd %s" % rd_pk)


def get_dockets(options):
    """Get the dockets by the particular judge now that we have run iquery
    for all of the cases in the jurisdiction and tagged the results.
    """
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    buchwald_id = 450
    ds = (
        Docket.objects.filter(
            court_id="nysd", assigned_to_id=buchwald_id, tags__name=NYSD_TAG
        )
        .exclude(idb_data__nature_of_suit__in=NOS_EXCLUSIONS)
        .exclude(idb_data__isnull=True)
    )
    logger.info("Got %s dockets to download", ds.count())
    for i, d in enumerate(ds):
        if i < options["skip_until"]:
            continue
        if i >= options["limit"] > 0:
            break
        if i % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            session.login()
        throttle.maybe_wait()
        logger.info("%s: Doing docket with pk: %s", i, d.pk)
        chain(
            get_docket_by_pacer_case_id.s(
                # docket_pk rides along in the data dict, per the task's
                # signature, so the task can skip the docket lookup.
                data={"pacer_case_id": d.pacer_case_id, "docket_pk": d.pk},
                court_id=d.court_id,
                cookies=session.cookies,
                tag_names=[BUCKWALD_TAG],
                **{
                    "show_parties_and_counsel": True,
                    "show_terminated_parties": True,
                    "show_list_of_member_cases": False,
                },
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()


def get_dockets(options):
    """Download the dockets described in the CSV"""
    f = options["file"]
    reader = csv.DictReader(f)
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(
        username=PACER_USERNAME, password=PACER_PASSWORD
    )
    pacer_session.login()
    for i, row in enumerate(reader):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        if i % 1000 == 0:
            pacer_session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        logger.info("Doing row %s", i)
        throttle.maybe_wait()
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=None,
                docket_number=make_docket_number(row["filecy"], row["docket"]),
                court_id="ilnb",
                cookies=pacer_session.cookies,
                office_number=row["office"],
                docket_number_letters="bk",
            ).set(queue=q),
            get_docket_by_pacer_case_id.s(
                court_id="ilnb",
                cookies=pacer_session.cookies,
                tag_names=[TAG],
                **{
                    "show_parties_and_counsel": True,
                    "show_terminated_parties": True,
                    "show_list_of_member_cases": True,
                },
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()


def get_petitions(options):
    """Just get document number one for every docket that's tagged in this
    collection.
    """
    rds = (
        RECAPDocument.objects.filter(
            tags__name=TAG,
            document_number="1",
            document_type=RECAPDocument.PACER_DOCUMENT,
        )
        .exclude(pacer_doc_id="")
        .order_by("pk")
        .values_list("pk", flat=True)
        .iterator()
    )
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(
        username=PACER_USERNAME, password=PACER_PASSWORD
    )
    pacer_session.login()
    for i, rd_pk in enumerate(rds):
        if i < options["offset"]:
            i += 1
            continue
        if i >= options["limit"] > 0:
            break
        if i % 1000 == 0:
            pacer_session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        logger.info("Doing row %s", i)
        throttle.maybe_wait()
        chain(
            get_pacer_doc_by_rd.s(
                rd_pk, pacer_session.cookies, tag=TAG_PETITIONS
            ).set(queue=q),
            extract_recap_pdf.si(rd_pk).set(queue=q),
            add_items_to_solr.si([rd_pk], "search.RECAPDocument").set(queue=q),
        ).apply_async()


def download_pacer_pdf_by_rd(self, rd_pk, pacer_case_id, pacer_doc_id,
                             cookies):
    """Using a RECAPDocument object ID, download the PDF if it doesn't
    already exist.

    :param rd_pk: The PK of the RECAPDocument to download
    :param pacer_case_id: The internal PACER case ID number
    :param pacer_doc_id: The internal PACER document ID to download
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies
    of a logged-in PACER user.
    :return: requests.Response object usually containing a PDF, or None if
    that wasn't possible.
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    pacer_court_id = map_cl_to_pacer_id(rd.docket_entry.docket.court_id)
    s = PacerSession(cookies=cookies)
    report = FreeOpinionReport(pacer_court_id, s)
    try:
        r = report.download_pdf(pacer_case_id, pacer_doc_id)
    except HTTPError as exc:
        if exc.response.status_code in [
            HTTP_500_INTERNAL_SERVER_ERROR,
            HTTP_504_GATEWAY_TIMEOUT,
        ]:
            logger.warning("Ran into HTTPError while getting PDF: %s. "
                           "Retrying.", exc.response.status_code)
            if self.request.retries == self.max_retries:
                self.request.callbacks = None
                return
            raise self.retry(exc=exc)
        else:
            logger.error("Ran into unknown HTTPError while getting PDF: "
                         "%s. Aborting.", exc.response.status_code)
            self.request.callbacks = None
            return
    except requests.RequestException as exc:
        logger.warning("Unable to get PDF for %s in %s",
                       pacer_doc_id, pacer_case_id)
        if self.request.retries == self.max_retries:
            self.request.callbacks = None
            return
        raise self.retry(exc=exc)
    return r


def update_any_missing_pacer_case_ids(options):
    """The network requests were making things far too slow and had to be
    disabled during the first pass. With this method, we update any items
    that are missing their pacer case ID value.
    """
    ds = Docket.objects.filter(
        idb_data__isnull=False,
        pacer_case_id=None,
    )
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, d in enumerate(queryset_generator(ds)):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        if i % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()
        throttle.maybe_wait()
        logger.info("Getting pacer_case_id for item %s", d)
        params = make_fjc_idb_lookup_params(d.idb_data)
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=d.pk,
                docket_number=d.idb_data.docket_number,
                court_id=d.idb_data.district_id,
                cookies=session.cookies,
                **params
            ).set(queue=q),
            update_docket_from_hidden_api.s().set(queue=q),
        ).apply_async()


def update_docket_info_iquery(self, d_pk: int, court_id: str) -> None:
    """Update the docket info from iquery

    :param self: The Celery task
    :param d_pk: The ID of the docket
    :param court_id: The court of the docket. Needed for throttling by
    court.
    :return: None
    """
    cookies = get_or_cache_pacer_cookies(
        "pacer_scraper",
        settings.PACER_USERNAME,
        password=settings.PACER_PASSWORD,
    )
    s = PacerSession(
        cookies=cookies,
        username=settings.PACER_USERNAME,
        password=settings.PACER_PASSWORD,
    )
    d = Docket.objects.get(pk=d_pk, court_id=court_id)
    report = CaseQuery(map_cl_to_pacer_id(d.court_id), s)
    try:
        report.query(d.pacer_case_id)
    except (requests.Timeout, requests.RequestException) as exc:
        logger.warning(
            "Timeout or unknown RequestException on iquery crawl. "
            "Trying again if retries not exceeded."
        )
        if self.request.retries == self.max_retries:
            return
        raise self.retry(exc=exc)
    if not report.data:
        return
    save_iquery_to_docket(
        self,
        report.data,
        d,
        tag_names=None,
        add_to_solr=True,
    )


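# A hypothetical dispatch sketch for update_docket_info_iquery() above. The
# court_id argument exists so callers can throttle or route work per court;
# the per-court queue naming scheme here is invented for illustration.
def example_refresh_dockets_from_iquery(docket_pks_by_court):
    for court_id, d_pks in docket_pks_by_court.items():
        for d_pk in d_pks:
            update_docket_info_iquery.apply_async(
                args=(d_pk, court_id),
                queue=f"iquery-{court_id}",  # hypothetical per-court queue
            )

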
def fetch_docket(self, fq_pk):
    """Fetch a docket from PACER

    This mirrors code elsewhere that gets dockets, but manages status as it
    goes through the process.

    :param fq_pk: The PK of the RECAP Fetch Queue to update.
    :return: The result of the docket fetch, or None on failure.
    """
    fq = PacerFetchQueue.objects.get(pk=fq_pk)
    mark_fq_status(fq, "", PROCESSING_STATUS.IN_PROGRESS)

    cookies = get_pacer_cookie_from_cache(fq.user_id)
    if cookies is None:
        msg = (
            "Cookie cache expired before task could run for user: %s"
            % fq.user_id
        )
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        return None

    court_id = fq.court_id or getattr(fq.docket, "court_id", None)
    s = PacerSession(cookies=cookies)
    try:
        result = fetch_pacer_case_id_and_title(s, fq, court_id)
    except (requests.RequestException, ReadTimeoutError) as exc:
        msg = "Network error getting pacer_case_id for fq: %s." % fq_pk
        if self.request.retries == self.max_retries:
            mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
            self.request.chain = None
            return None
        mark_fq_status(
            fq, msg + " Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY
        )
        raise self.retry(exc=exc)
    except PacerLoginException as exc:
        msg = (
            "PacerLoginException while getting pacer_case_id for fq: %s."
            % fq_pk
        )
        if self.request.retries == self.max_retries:
            mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
            self.request.chain = None
            return None
        mark_fq_status(
            fq, msg + " Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY
        )
        raise self.retry(exc=exc)
    except ParsingException:
        msg = "Unable to parse pacer_case_id for docket."
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return None

    # result can be one of three values:
    #   None       --> Sealed or missing case
    #   Empty dict --> Didn't run the pacer_case_id lookup (wasn't needed)
    #   Full dict  --> Ran the query, got back results
    if result is None:
        msg = "Cannot find case by docket number (perhaps it's sealed?)"
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return None

    pacer_case_id = getattr(fq.docket, "pacer_case_id", None) or result.get(
        "pacer_case_id"
    )
    if not pacer_case_id:
        msg = "Unable to determine pacer_case_id for docket."
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return None

    try:
        result = fetch_docket_by_pacer_case_id(s, court_id, pacer_case_id, fq)
    except (requests.RequestException, ReadTimeoutError) as exc:
        msg = "Network error getting docket for fq: %s." % fq_pk
        if self.request.retries == self.max_retries:
            mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
            self.request.chain = None
            return None
        mark_fq_status(
            fq, msg + " Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY
        )
        raise self.retry(exc=exc)
    except ParsingException:
        msg = "Unable to parse docket report."
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return None

    msg = "Successfully got and merged docket. Adding to Solr as final step."
    mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL)
    return result


def download_documents(options):
    """We've got good values in the new columns, so we just need to look
    those up and get the documents from PACER.
    """
    f = open(options["input_file"], "r")
    dialect = csv.Sniffer().sniff(f.read(1024))
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect)
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q, min_items=options["queue_length"])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing row %s: %s", i, row)
        docket_number = (
            row["cl_d_docket_number"]
            or row["cl_d_docket_number (student)"]
            or None
        )
        if not docket_number:
            logger.warning("No docket number found for row: %s", i)
            continue
        court = Court.objects.get(
            fjc_court_id=row["AO ID"].rjust(2, "0"),
            jurisdiction=Court.FEDERAL_DISTRICT,
        )
        try:
            d = Docket.objects.get(docket_number=docket_number, court=court)
        except Docket.MultipleObjectsReturned:
            logger.warning("Multiple objects returned for row: %s", i)
            continue
        except Docket.DoesNotExist:
            logger.warning("Could not find docket for row: %s", i)
            continue

        # Got the docket, now get the documents from it, tag & OCR them.
        document_date = datetime.strptime(row["Date"], "%m/%d/%Y").date()
        des = d.docket_entries.filter(date_filed=document_date)
        count = des.count()
        if count == 0:
            logger.warning("No docket entries found for row: %s", i)
            continue
        elif count == 1:
            good_des = [des[0]]
        else:
            # More than one item. Apply filtering rules.
            good_des = filter_des(des)

        # We've got our des, now download them.
        for de in good_des:
            rds = de.recap_documents.filter(
                document_type=RECAPDocument.PACER_DOCUMENT
            )
            for rd in rds:
                if not rd.pacer_doc_id:
                    logger.warning(
                        "Unable to get pacer_doc_id for item with "
                        "rd_pk: %s. Restricted document?",
                        rd.pk,
                    )
                    continue
                if options["task"] == "add_extra_tags":
                    # Wherein I belatedly realize we need a tag
                    # specifically for this part of the project.
                    add_tags(rd, TAG_NAME_OPINIONS)
                else:
                    # Otherwise, do the normal download thing.
                    chain(
                        get_pacer_doc_by_rd.s(
                            rd.pk, session.cookies, tag=TAG_NAME
                        ).set(queue=q),
                        extract_recap_pdf.si(rd.pk).set(queue=q),
                        add_items_to_solr.si(
                            [rd.pk], "search.RECAPDocument"
                        ).set(queue=q),
                    ).apply_async()
    f.close()


def get_dockets(options):
    """Download the dockets described in the CSV according to the `task`
    option.
    """
    f = options['file']
    reader = csv.DictReader(f)
    q = options['queue']
    task = options['task']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        if row['Too Old'] == 'Yes':
            continue
        if row['Appellate/District'].lower() != task:
            # Only do appellate when appellate, and district when district.
            continue

        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", i, row)
        throttle.maybe_wait()
        if task == 'appellate':
            chain(
                get_appellate_docket_by_docket_number.s(
                    docket_number=row['Cleaned case_No'],
                    court_id=row['fjc_court_id'],
                    cookies=session.cookies,
                    tag_names=[TAG],
                    **{
                        'show_docket_entries': True,
                        'show_orig_docket': True,
                        'show_prior_cases': True,
                        'show_associated_cases': True,
                        'show_panel_info': True,
                        'show_party_atty_info': True,
                        'show_caption': True,
                    }
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
        elif task == 'district':
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=row['Cleaned case_No'],
                    court_id=row['fjc_court_id'],
                    cookies=session.cookies,
                    case_name=row['Title'],
                ).set(queue=q),
                get_docket_by_pacer_case_id.s(
                    court_id=row['fjc_court_id'],
                    cookies=session.cookies,
                    tag_names=[TAG],
                    **{
                        'show_parties_and_counsel': True,
                        'show_terminated_parties': True,
                        'show_list_of_member_cases': True,
                    }
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()


def download_dockets(options):
    """Download dockets listed in the spreadsheet."""
    f = open(options["input_file"], "r")
    dialect = csv.Sniffer().sniff(f.read(2048))
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect)
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing row %s: %s", i, row)
        row_tag = f"{PROJECT_TAG_NAME}-{row['id']}"
        if not row["district_ct"]:
            chain(
                get_appellate_docket_by_docket_number.s(
                    docket_number=row["docket_no1"],
                    court_id=row["cl_court"],
                    cookies=session.cookies,
                    tag_names=[PROJECT_TAG_NAME, row_tag],
                    # Do not get the docket entries for now. We're only
                    # interested in the date terminated. If it's an open
                    # case, we'll handle that later.
                    **{
                        "show_docket_entries": False,
                        "show_orig_docket": False,
                        "show_prior_cases": False,
                        "show_associated_cases": False,
                        "show_panel_info": True,
                        "show_party_atty_info": True,
                        "show_caption": True,
                    },
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
        else:
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=row["docket_no1"],
                    court_id=row["cl_court"],
                    cookies=session.cookies,
                    case_name=row["name"],
                ).set(queue=q),
                do_case_query_by_pacer_case_id.s(
                    court_id=row["cl_court"],
                    cookies=session.cookies,
                    tag_names=[PROJECT_TAG_NAME, row_tag],
                ).set(queue=q),
                get_docket_by_pacer_case_id.s(
                    court_id=row["cl_court"],
                    cookies=session.cookies,
                    tag_names=[PROJECT_TAG_NAME, row_tag],
                    **{
                        # No docket entries
                        "doc_num_start": 10000,
                        "doc_num_end": 10000,
                        "show_parties_and_counsel": True,
                        "show_terminated_parties": True,
                        "show_list_of_member_cases": True,
                    },
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
    f.close()


def get_appellate_docket_by_docket_number(self, docket_number, court_id,
                                          cookies, tag_names=None, **kwargs):
    """Get a docket by docket number, CL court ID, and a collection of
    kwargs that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    :param docket_number: The docket number of the case.
    :param court_id: A courtlistener/PACER appellate court ID.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies
    of a logged-in PACER user.
    :param tag_names: The tag names that should be stored with the item in
    the DB, if desired.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    """
    s = PacerSession(cookies=cookies)
    report = AppellateDocketReport(court_id, s)
    logging_id = "%s - %s" % (court_id, docket_number)
    logger.info("Querying docket report %s", logging_id)
    try:
        report.query(docket_number, **kwargs)
    except requests.RequestException as e:
        logger.warning("Problem getting docket %s", logging_id)
        if self.request.retries == self.max_retries:
            self.request.callbacks = None
            return None
        raise self.retry(exc=e)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s", logging_id)

    if docket_data == {}:
        logger.info("Unable to find docket: %s", logging_id)
        self.request.callbacks = None
        return None

    try:
        d = Docket.objects.get(
            docket_number=docket_number,
            court_id=court_id,
        )
    except Docket.DoesNotExist:
        d = None
    except Docket.MultipleObjectsReturned:
        d = None

    if d is None:
        d, count = find_docket_object(court_id, docket_number,
                                      docket_number)
        if count > 1:
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d, og_info = update_docket_appellate_metadata(d, docket_data)
    if not d.pacer_case_id:
        d.pacer_case_id = docket_number

    if og_info is not None:
        og_info.save()
        d.originating_court_information = og_info
    d.save()
    tags = []
    if tag_names is not None:
        for tag_name in tag_names:
            tag, _ = Tag.objects.get_or_create(name=tag_name)
            tag.tag_object(d)
            tags.append(tag)

    # Save the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d,
                                upload_type=UPLOAD_TYPE.APPELLATE_DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, content_updated = add_docket_entries(
        d, docket_data['docket_entries'], tags=tags)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s", d)
    return {
        'docket_pk': d.pk,
        'content_updated': bool(rds_created or content_updated),
    }


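# A hypothetical sketch of calling get_appellate_docket_by_docket_number()
# above outside of a chain, showing the kwargs it forwards to the appellate
# DocketReport.query(). The docket number, court ID, tag, and queue are all
# invented; the kwargs mirror those used by the CSV-driven callers above.
def example_fetch_appellate_docket(session, q):
    get_appellate_docket_by_docket_number.si(
        docket_number="19-1234",   # hypothetical
        court_id="ca9",            # hypothetical appellate court ID
        cookies=session.cookies,
        tag_names=["example-tag"],
        show_docket_entries=True,
        show_orig_docket=True,
        show_prior_cases=False,
        show_associated_cases=False,
        show_panel_info=True,
        show_party_atty_info=True,
        show_caption=True,
    ).apply_async(queue=q)

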
def get_docket_by_pacer_case_id(self, data, court_id, cookies,
                                tag_names=None, **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of
    kwargs that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    :param data: A dict containing:
        Required: 'pacer_case_id': The internal case ID of the item in
                  PACER.
        Optional: 'docket_pk': The ID of the docket to work on to avoid
                  lookups if it's known in advance.
    :param court_id: A courtlistener court ID.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies
    of a logged-in PACER user.
    :param tag_names: A list of tag names that should be stored with the
    item in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    :return: A dict indicating if we need to update Solr.
    """
    s = PacerSession(cookies=cookies)
    if data is None:
        logger.info("Empty data argument. Terminating chains and exiting.")
        self.request.callbacks = None
        return

    pacer_case_id = data.get('pacer_case_id')
    report = DocketReport(map_cl_to_pacer_id(court_id), s)
    logger.info("Querying docket report %s.%s", court_id, pacer_case_id)

    if data.get('docket_pk') is not None:
        d = Docket.objects.get(pk=data['docket_pk'])
    else:
        try:
            d = Docket.objects.get(
                pacer_case_id=pacer_case_id,
                court_id=court_id,
            )
        except Docket.DoesNotExist:
            d = None
        except Docket.MultipleObjectsReturned:
            d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        if first_missing_id > 1:
            # We don't have to get the whole thing!
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s",
                court_id, pacer_case_id)

    if not docket_data:
        logger.info("No valid docket data for %s.%s",
                    court_id, pacer_case_id)
        self.request.callbacks = None
        return

    # Merge the contents into CL.
    if d is None:
        d, count = find_docket_object(court_id, pacer_case_id,
                                      docket_data['docket_number'])
        if count > 1:
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d.save()
    tags = []
    if tag_names is not None:
        for tag_name in tag_names:
            tag, _ = Tag.objects.get_or_create(name=tag_name)
            tag.tag_object(d)
            tags.append(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d,
                                upload_type=UPLOAD_TYPE.DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, content_updated = add_docket_entries(
        d, docket_data['docket_entries'], tags=tags)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s", d)
    return {
        'docket_pk': d.pk,
        'content_updated': bool(rds_created or content_updated),
    }


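# A minimal sketch of the `data` contract for get_docket_by_pacer_case_id()
# above: pacer_case_id is required; docket_pk is optional and skips the
# Docket lookup when the caller already knows it. All concrete values here
# are hypothetical.
def example_get_docket_call(session, q):
    get_docket_by_pacer_case_id.si(
        data={"pacer_case_id": "12345", "docket_pk": 678},  # hypothetical
        court_id="cand",  # hypothetical CL court ID
        cookies=session.cookies,
        tag_names=["example-tag"],
        show_parties_and_counsel=True,
        show_terminated_parties=True,
        show_list_of_member_cases=False,
    ).apply_async(queue=q)

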
def do_case_query_by_pacer_case_id(self, data, court_id, cookies,
                                   tag_names=None):
    """Run a case query (iquery.pl) query on a case and save the data

    :param data: A dict containing at least the following:
        {
            'pacer_case_id': The internal pacer case ID for the item.
        }
    :param court_id: A courtlistener court ID
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies
    of a logged-in PACER user.
    :param tag_names: A list of tag names to associate with the docket when
    saving it in the DB.
    :return: A dict with the pacer_case_id and docket_pk values.
    """
    s = PacerSession(cookies=cookies)
    if data is None:
        logger.info("Empty data argument. Terminating chains and exiting.")
        self.request.callbacks = None
        return

    pacer_case_id = data.get('pacer_case_id')
    report = CaseQuery(map_cl_to_pacer_id(court_id), s)
    logger.info("Querying docket report %s.%s", court_id, pacer_case_id)
    try:
        d = Docket.objects.get(
            pacer_case_id=pacer_case_id,
            court_id=court_id,
        )
    except Docket.DoesNotExist:
        d = None
    except Docket.MultipleObjectsReturned:
        d = None

    report.query(pacer_case_id)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s",
                court_id, pacer_case_id)

    if not docket_data:
        logger.info("No valid docket data for %s.%s",
                    court_id, pacer_case_id)
        self.request.callbacks = None
        return

    # Merge the contents into CL.
    if d is None:
        d, count = find_docket_object(court_id, pacer_case_id,
                                      docket_data['docket_number'])
        if count > 1:
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d.save()

    tags = []
    if tag_names is not None:
        for tag_name in tag_names:
            tag, _ = Tag.objects.get_or_create(name=tag_name)
            tag.tag_object(d)
            tags.append(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d,
                                upload_type=UPLOAD_TYPE.CASE_REPORT_PAGE)
    pacer_file.filepath.save(
        # We only care about the ext w/UUIDFileSystemStorage
        'case_report.html',
        ContentFile(report.response.text),
    )

    logger.info("Created/updated docket: %s", d)
    return {
        'pacer_case_id': pacer_case_id,
        'docket_pk': d.pk,
    }