def get_final_docs(options):
    """Get any documents that contain "final" in their description."""
    des = (
        DocketEntry.objects.filter(tags__name=TAG, description__icontains="final")
        .order_by("pk")
        .iterator()
    )
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    pacer_session.login()
    for i, de in enumerate(des):
        if i < options["offset"]:
            i += 1
            continue
        if i >= options["limit"] > 0:
            break
        if i % 1000 == 0:
            pacer_session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            pacer_session.login()
            logger.info(f"Sent {i} tasks to celery so far.")
        logger.info("Doing row %s", i)
        rd_pks = (
            de.recap_documents.filter(
                document_type=RECAPDocument.PACER_DOCUMENT,
            )
            .exclude(pacer_doc_id="")
            .order_by("pk")
            .values_list("pk", flat=True)
        )
        for rd_pk in rd_pks:
            throttle.maybe_wait()
            chain(
                get_pacer_doc_by_rd.s(
                    rd_pk, pacer_session.cookies, tag=TAG_FINALS
                ).set(queue=q),
                extract_recap_pdf.si(rd_pk).set(queue=q),
                add_items_to_solr.si([rd_pk], "search.RECAPDocument").set(queue=q),
            ).apply_async()
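# A note on the loop guards used throughout these commands (an editorial
# aside, not part of the original source): "i >= options['limit'] > 0" is a
# chained comparison, equivalent to
# "i >= options['limit'] and options['limit'] > 0",
# so the customary default limit of 0 means "no limit" and the break never
# fires.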
def get_district_attachment_pages(options):
    """Get the attachment page information for all of the items on the dockets

    :param options: The options returned by argparse.
    :type options: dict
    """
    q = options['queue']
    recap_user = User.objects.get(username='******')
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    rd_pks = RECAPDocument.objects.filter(
        tags__name=TAG,
        docket_entry__docket__court__jurisdiction__in=[
            Court.FEDERAL_DISTRICT,
            Court.FEDERAL_BANKRUPTCY,
        ],
    ).values_list('pk', flat=True)
    for i, rd_pk in enumerate(rd_pks):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        if i % 100 == 0:
            logger.info("Doing item %s: %s", i, rd_pk)
        throttle.maybe_wait()
        chain(
            get_attachment_page_by_rd.s(rd_pk, session.cookies).set(queue=q),
            make_attachment_pq_object.s(rd_pk, recap_user.pk).set(queue=q),
            process_recap_attachment.s(tag_names=[TAG]).set(queue=q),
        ).apply_async()
def get_dockets(options, items, tags, sample_size=0, doc_num_end=''): """Download dockets from PACER. :param options: Options provided by argparse :param items: Items from our FJC IDB database :param tags: A list of tag names to associate with the purchased content. :param sample_size: The number of items to get. If 0, get them all. Else, get only this many and do it randomly. :param doc_num_end: Only get docket numbers up to this value to constrain costs. If set to an empty string, no constraints are applied. Note that applying this value means no unnumbered entries will be retrieved by PACER. """ if sample_size > 0: items = items.order_by('?')[:sample_size] q = options['queue'] throttle = CeleryThrottle(queue_name=q) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() for i, row in enumerate(items): if i < options['offset']: continue if i >= options['limit'] > 0: break if i % 5000 == 0: # Re-authenticate just in case the auto-login mechanism isn't # working. session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() # All tests pass. Get the docket. logger.info("Doing row %s: %s", i, row) throttle.maybe_wait() params = make_fjc_idb_lookup_params(row) chain( get_pacer_case_id_and_title.s( pass_through=None, docket_number=row.docket_number, court_id=row.district_id, cookies=session.cookies, **params ).set(queue=q), filter_docket_by_tags.s(tags, row.district_id).set(queue=q), get_docket_by_pacer_case_id.s( court_id=row.district_id, cookies=session.cookies, tag_names=tags, **{ 'show_parties_and_counsel': True, 'show_terminated_parties': True, 'show_list_of_member_cases': False, 'doc_num_end': doc_num_end, } ).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async()
def add_all_nysd_to_cl(options):
    """Alas, there's only one way to get all the cases about a particular
    judge: Get all the cases in the entire jurisdiction. We do that here using
    the iquery.pl endpoint. Once added to the DB we'll ensure they're tagged.

    In the next step, we'll download all the tagged items.
    """
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    # IDs obtained by binary search of docket numbers on PACER website.
    earliest_id = 405990
    latest_id = 543051
    for pacer_case_id in range(earliest_id, latest_id):
        if pacer_case_id < options["skip_until"]:
            continue
        if pacer_case_id >= options["limit"] > 0:
            break
        if pacer_case_id % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            session.login()
        throttle.maybe_wait()
        logger.info("Doing pacer_case_id: %s", pacer_case_id)
        make_docket_by_iquery.apply_async(
            args=("nysd", pacer_case_id, session.cookies, [NYSD_TAG]),
            queue=q,
        )
def get_data(options, row_transform, tags):
    """Download dockets from a CSV, then download claims register data from
    those dockets.

    :param options: The options provided at the command line.
    :param row_transform: A function that takes a row as its argument and
    returns a cleaned up version of the row with the needed attributes. This
    parameter allows this function to work with almost any CSV.
    :param tags: Tags you wish to apply to the gathered data.
    """
    f = options['file']
    reader = csv.DictReader(f)
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        # All tests pass. Get the docket.
        row = row_transform(row)
        logger.info("Doing row %s: %s", i, row)
        throttle.maybe_wait()
        get_docket_and_claims(
            row['docket_number'],
            row['court'],
            row['case_name'],
            session.cookies,
            tags,
            q,
        )
def get_dockets(options, items, tags, sample_size=0, doc_num_end=""): """Download dockets from PACER. :param options: Options provided by argparse :param items: Items from our FJC IDB database :param tags: A list of tag names to associate with the purchased content. :param sample_size: The number of items to get. If 0, get them all. Else, get only this many and do it randomly. :param doc_num_end: Only get docket numbers up to this value to constrain costs. If set to an empty string, no constraints are applied. Note that applying this value means no unnumbered entries will be retrieved by PACER. """ if sample_size > 0: items = items.order_by("?")[:sample_size] q = options["queue"] throttle = CeleryThrottle(queue_name=q) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() for i, row in enumerate(items): if i < options["offset"]: continue if i >= options["limit"] > 0: break if i % 5000 == 0: # Re-authenticate just in case the auto-login mechanism isn't # working. session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() # All tests pass. Get the docket. logger.info("Doing row %s: %s", i, row) throttle.maybe_wait() params = make_fjc_idb_lookup_params(row) chain( get_pacer_case_id_and_title.s( pass_through=None, docket_number=row.docket_number, court_id=row.district_id, cookies=session.cookies, **params, ).set(queue=q), filter_docket_by_tags.s(tags, row.district_id).set(queue=q), get_docket_by_pacer_case_id.s( court_id=row.district_id, cookies=session.cookies, tag_names=tags, **{ "show_parties_and_counsel": True, "show_terminated_parties": True, "show_list_of_member_cases": False, "doc_num_end": doc_num_end, }, ).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async()
def get_attachment_pages(options, tag):
    rd_pks = RECAPDocument.objects.filter(
        tags__name=tag,
        docket_entry__description__icontains='attachment',
    ).values_list('pk', flat=True)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    get_district_attachment_pages(options=options, rd_pks=rd_pks,
                                  tag_names=[tag], session=session)
def get_attachment_pages(options): """Find docket entries that look like invoices and get their attachment pages. """ page_size = 100 main_query = build_main_query_from_query_string( Q_DOCS_ONLY, {"rows": page_size, "fl": ["id", "docket_id"]}, {"group": False, "facet": False, "highlight": False}, ) si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r") results = si.query().add_extra(**main_query) si.conn.http_connection.close() q = options["queue"] recap_user = User.objects.get(username="******") throttle = CeleryThrottle(queue_name=q) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() paginator = Paginator(results, page_size) i = 0 for page_number in range(1, paginator.num_pages + 1): paged_results = paginator.page(page_number) for result in paged_results.object_list: if i < options["offset"]: i += 1 continue if i >= options["limit"] > 0: break logger.info( "Doing row %s: rd: %s, docket: %s", i, result["id"], result["docket_id"], ) throttle.maybe_wait() chain( # Query the attachment page and process it get_attachment_page_by_rd.s(result["id"], session.cookies).set( queue=q ), # Take that in a new task and make a PQ object make_attachment_pq_object.s(result["id"], recap_user.pk).set( queue=q ), # And then process that using the normal machinery. process_recap_attachment.s(tag_names=[TAG_PHASE_1]).set( queue=q ), ).apply_async() i += 1 else: # Inner loop exited normally (didn't "break") continue # Inner loop broke. Break outer loop too. break
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 20000
    main_query = build_main_query_from_query_string(
        QUERY_STRING,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False, 'highlight': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing item %s w/rd: %s, d: %s", i, result['id'],
                    result['docket_id'])

        try:
            rd = RECAPDocument.objects.get(pk=result['id'])
        except RECAPDocument.DoesNotExist:
            logger.warning("Unable to find RECAP Document with id %s",
                           result['id'])
            continue

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG)
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies, tag=TAG).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
def log_into_pacer(username, password):
    """Log into PACER and return the cookie jar

    :param username: A PACER username
    :param password: A PACER password
    :return: Request.CookieJar
    """
    s = PacerSession(username=username, password=password)
    s.login()
    return s.cookies
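# A minimal usage sketch (hypothetical, not from the original module): the
# cookie jar returned by log_into_pacer() is what the Celery tasks above take
# as their `cookies` argument, so a caller can authenticate once and share the
# cookies across many queued tasks. The helper name below, and the TAG and
# queue values, are placeholders borrowed from the surrounding functions.
def _example_fan_out(rd_pks, q):
    cookies = log_into_pacer(PACER_USERNAME, PACER_PASSWORD)
    for rd_pk in rd_pks:
        # Same task signature used elsewhere in this file.
        get_pacer_doc_by_rd.s(rd_pk, cookies, tag=TAG).set(queue=q).apply_async()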
def get_dockets(options, items, tags, sample_size=0): """Download dockets from PACER. :param options: Options provided by argparse :param items: Items from our FJC IDB database :param tags: A list of tag names to associate with the purchased content. :param sample_size: The number of items to get. If 0, get them all. Else, get only this many and do it randomly. """ if sample_size > 0: items = items.order_by('?')[:sample_size] q = options['queue'] throttle = CeleryThrottle(queue_name=q) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() for i, row in enumerate(items): if i < options['offset']: continue if i >= options['limit'] > 0: break if i % 5000 == 0: # Re-authenticate just in case the auto-login mechanism isn't # working. session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() # All tests pass. Get the docket. logger.info("Doing row %s: %s", i, row) throttle.maybe_wait() params = make_fjc_idb_lookup_params(row) chain( get_pacer_case_id_and_title.s( docket_number=row.docket_number, court_id=row.district_id, cookies=session.cookies, **params ).set(queue=q), filter_docket_by_tags.s(tags, row.district_id).set(queue=q), get_docket_by_pacer_case_id.s( court_id=row.district_id, cookies=session.cookies, tag_names=tags, **{ 'show_parties_and_counsel': True, 'show_terminated_parties': True, 'show_list_of_member_cases': True } ).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async()
def get_documents(options): """Download documents from PACER if we don't already have them.""" q = options["queue"] throttle = CeleryThrottle(queue_name=q, min_items=options["queue_length"]) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() page_size = 10000 main_query = build_main_query_from_query_string( Q_INVOICES, { "rows": page_size, "fl": ["id", "docket_id"] }, { "group": False, "facet": False, "highlight": False }, ) si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r") results = si.query().add_extra(**main_query).execute() si.conn.http_connection.close() logger.info("Got %s search results.", results.result.numFound) for i, result in enumerate(results): if i < options["offset"]: i += 1 continue if i >= options["limit"] > 0: break throttle.maybe_wait() rd = RECAPDocument.objects.get(pk=result["id"]) logger.info("Doing item %s w/rd: %s, d: %s", i, rd.pk, result["docket_id"]) if rd.is_available: logger.info("Already have pk %s; just tagging it.", rd.pk) add_tags(rd, TAG_PHASE_2) i += 1 continue if not rd.pacer_doc_id: logger.info("Unable to find pacer_doc_id for: %s", rd.pk) i += 1 continue chain( get_pacer_doc_by_rd.s(rd.pk, session.cookies, tag=TAG_PHASE_2).set(queue=q), extract_recap_pdf.si(rd.pk).set(queue=q), add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q), ).apply_async() i += 1
def download_dockets(options):
    """Download dockets listed in the spreadsheet."""
    with open(options["input_file"], "r") as f:
        dialect = csv.Sniffer().sniff(f.read(1024))
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect)
        q = options["queue"]
        task = options["task"]
        throttle = CeleryThrottle(queue_name=q, min_items=options["queue_length"])
        session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
        session.login()
        for i, row in enumerate(reader):
            if i < options["offset"]:
                continue
            if i >= options["limit"] > 0:
                break
            throttle.maybe_wait()

            logger.info("Doing row %s: %s", i, row)

            if row["idb_docket_number"]:
                if task == "download_student_dockets":
                    continue
                # Zero-pad the docket number up to seven digits because Excel
                # ate the leading zeros that these would normally have.
                docket_number = row["idb_docket_number"].rjust(7, "0")
            elif row["student_docket_number"]:
                # Use the values collected by student researchers, then
                # cleaned up by mlr.
                docket_number = row["student_docket_number"]
            else:
                # No docket number; move on.
                continue
            court = Court.objects.get(
                fjc_court_id=row["AO ID"].rjust(2, "0"),
                jurisdiction=Court.FEDERAL_DISTRICT,
            )
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=docket_number,
                    court_id=court.pk,
                    cookies=session.cookies,
                    case_name=row["Case Name"],
                ).set(queue=q),
                get_docket_by_pacer_case_id.s(
                    court_id=court.pk,
                    cookies=session.cookies,
                    tag_names=[TAG_NAME],
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
def get_att_pages(options):
    rd_pks = RECAPDocument.objects.filter(
        tags__name=TAG,
        docket_entry__docket__court__jurisdiction__in=[
            Court.FEDERAL_DISTRICT,
            Court.FEDERAL_BANKRUPTCY,
        ],
    ).values_list('pk', flat=True)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    get_district_attachment_pages(options=options, rd_pks=rd_pks,
                                  tag_names=[TAG], session=session)
def download_dockets(options):
    """Download dockets listed in the spreadsheet."""
    with open(options['input_file'], 'r') as f:
        dialect = csv.Sniffer().sniff(f.read(1024))
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect)
        q = options['queue']
        task = options['task']
        throttle = CeleryThrottle(queue_name=q, min_items=options['queue_length'])
        session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
        session.login()
        for i, row in enumerate(reader):
            if i < options['offset']:
                continue
            if i >= options['limit'] > 0:
                break
            throttle.maybe_wait()

            logger.info("Doing row %s: %s", i, row)

            if row['idb_docket_number']:
                if task == 'download_student_dockets':
                    continue
                # Zero-pad the docket number up to seven digits because Excel
                # ate the leading zeros that these would normally have.
                docket_number = row['idb_docket_number'].rjust(7, '0')
            elif row['student_docket_number']:
                # Use the values collected by student researchers, then
                # cleaned up by mlr.
                docket_number = row['student_docket_number']
            else:
                # No docket number; move on.
                continue
            court = Court.objects.get(fjc_court_id=row['AO ID'].rjust(2, '0'),
                                      jurisdiction=Court.FEDERAL_DISTRICT)
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=docket_number,
                    court_id=court.pk,
                    cookies=session.cookies,
                    case_name=row['Case Name'],
                ).set(queue=q),
                get_docket_by_pacer_case_id.s(
                    court_id=court.pk,
                    cookies=session.cookies,
                    tag_names=[TAG_NAME],
                ).set(queue=q),
                add_or_update_recap_docket.s().set(queue=q),
            ).apply_async()
def get_att_pages(options):
    rd_pks = RECAPDocument.objects.filter(
        tags__name=TAG,
        docket_entry__docket__court__jurisdiction__in=[
            Court.FEDERAL_DISTRICT,
            Court.FEDERAL_BANKRUPTCY,
        ],
    ).values_list("pk", flat=True)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    get_district_attachment_pages(
        options=options, rd_pks=rd_pks, tag_names=[TAG], session=session
    )
def get_dockets(options): """Download a sample of dockets from PACER matching the 7xx series of NOS codes. """ nos_codes = [ LABOR_LITIGATION_OTHER, LABOR_MANAGEMENT_RELATIONS_ACT, LABOR_MANAGEMENT_REPORT_DISCLOSURE, FAIR_LABOR_STANDARDS_ACT_CV, RAILWAY_LABOR_ACT, FAMILY_AND_MEDICAL_LEAVE_ACT, EMPLOYEE_RETIREMENT_INCOME_SECURITY_ACT ] sample_size = 300 items = FjcIntegratedDatabase.objects.filter( nature_of_suit__in=nos_codes, date_terminated__gt='2009-01-01', date_terminated__lt='2018-10-15', date_filed__gt='2009-01-01').order_by('?')[:sample_size] q = options['queue'] throttle = CeleryThrottle(queue_name=q) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() for i, row in enumerate(items): if i < options['offset']: continue if i >= options['limit'] > 0: break # All tests pass. Get the docket. logger.info("Doing row %s: %s", i, row) logger.info("This case is from year: %s", row.date_filed.year) throttle.maybe_wait() case_name = '%s v. %s' % (row.plaintiff, row.defendant) chain( get_pacer_case_id_and_title.s( docket_number=row.docket_number, court_id=row.district_id, cookies=session.cookies, case_name=case_name, ).set(queue=q), get_docket_by_pacer_case_id.s(court_id=row.district_id, cookies=session.cookies, tag_names=[TAG], **{ 'show_parties_and_counsel': True, 'show_terminated_parties': True, 'show_list_of_member_cases': True }).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async()
def get_attachment_pages(options): """Find docket entries that look like invoices and get their attachment pages. """ page_size = 100 main_query = build_main_query_from_query_string( Q_DOCS_ONLY, {'rows': page_size, 'fl': ['id', 'docket_id']}, {'group': False, 'facet': False}, ) si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r') results = si.query().add_extra(**main_query) q = options['queue'] recap_user = User.objects.get(username='******') throttle = CeleryThrottle(queue_name=q, min_items=options['queue_length']) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() paginator = Paginator(results, page_size) i = 0 for page_number in range(1, paginator.num_pages + 1): paged_results = paginator.page(page_number) for result in paged_results.object_list: if i < options['offset']: i += 1 continue if i >= options['limit'] > 0: break logger.info("Doing row %s: rd: %s, docket: %s", i, result['id'], result['docket_id']) throttle.maybe_wait() chain( # Query the attachment page and process it get_attachment_page_by_rd.s( result['id'], session.cookies).set(queue=q), # Take that in a new task and make a PQ object make_attachment_pq_object.s( result['id'], recap_user.pk).set(queue=q), # And then process that using the normal machinery. process_recap_attachment.s(tag_names=[TAG_PHASE_1]).set(queue=q), ).apply_async() i += 1 else: # Inner loop exited normally (didn't "break") continue # Inner loop broke. Break outer loop too. break
def get_documents(options): """Download documents from PACER if we don't already have them.""" q = options['queue'] throttle = CeleryThrottle(queue_name=q, min_items=options['queue_length']) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() page_size = 10000 main_query = build_main_query_from_query_string( Q_INVOICES, {'rows': page_size, 'fl': ['id', 'docket_id']}, {'group': False, 'facet': False}, ) si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r') results = si.query().add_extra(**main_query).execute() logger.info("Got %s search results.", results.result.numFound) for i, result in enumerate(results): if i < options['offset']: i += 1 continue if i >= options['limit'] > 0: break throttle.maybe_wait() rd = RECAPDocument.objects.get(pk=result['id']) logger.info("Doing item %s w/rd: %s, d: %s", i, rd.pk, result['docket_id']) if rd.is_available: logger.info("Already have pk %s; just tagging it.", rd.pk) add_tags(rd, TAG_PHASE_2) i += 1 continue if not rd.pacer_doc_id: logger.info("Unable to find pacer_doc_id for: %s", rd.pk) i += 1 continue chain( get_pacer_doc_by_rd.s(rd.pk, session.cookies, tag=TAG_PHASE_2).set(queue=q), extract_recap_pdf.si(rd.pk).set(queue=q), add_items_to_solr.si([rd.pk], 'search.RECAPDocument').set(queue=q), ).apply_async() i += 1
def get_dockets(options): """Get the dockets by the particular judge now that we have run iquery for all of the cases in the jurisdiction, and now that we have """ q = options["queue"] throttle = CeleryThrottle(queue_name=q) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() buchwald_id = 450 ds = ( Docket.objects.filter( court_id="nysd", assigned_to_id=buchwald_id, tags__name=NYSD_TAG ) .exclude(idb_data__nature_of_suit__in=NOS_EXCLUSIONS) .exclude(idb_data__isnull=True) ) logger.info("Got %s dockets to download", ds.count()) for i, d in enumerate(ds): if i < options["skip_until"]: continue if i >= options["limit"] > 0: break if i % 5000 == 0: # Re-authenticate just in case the auto-login mechanism isn't # working. session = PacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) session.login() throttle.maybe_wait() logger.info("%s: Doing docket with pk: %s", i, d.pk) chain( get_docket_by_pacer_case_id.s( data={"pacer_case_id": d.pacer_case_id}, court_id=d.court_id, cookies=session.cookies, docket_pk=d.pk, tag_names=[BUCKWALD_TAG], **{ "show_parties_and_counsel": True, "show_terminated_parties": True, "show_list_of_member_cases": False, } ).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async()
def get_dockets(options): """Download the dockets described in the CSV """ f = options["file"] reader = csv.DictReader(f) q = options["queue"] throttle = CeleryThrottle(queue_name=q) pacer_session = PacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) pacer_session.login() for i, row in enumerate(reader): if i < options["offset"]: continue if i >= options["limit"] > 0: break if i % 1000 == 0: pacer_session = PacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) pacer_session.login() logger.info("Sent %s tasks to celery so far." % i) logger.info("Doing row %s", i) throttle.maybe_wait() chain( get_pacer_case_id_and_title.s( pass_through=None, docket_number=make_docket_number(row["filecy"], row["docket"]), court_id="ilnb", cookies=pacer_session.cookies, office_number=row["office"], docket_number_letters="bk", ).set(queue=q), get_docket_by_pacer_case_id.s( court_id="ilnb", cookies=pacer_session.cookies, tag_names=[TAG], **{ "show_parties_and_counsel": True, "show_terminated_parties": True, "show_list_of_member_cases": True, } ).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async()
def handle(self, *args, **options): super(Command, self).handle(*args, **options) recipients = options["recipients"].split(",") print("Recipients list is: %s" % recipients) s = PacerSession( username=settings.PACER_USERNAME, password=settings.PACER_PASSWORD ) s.login() report = CaseQueryAdvancedBankruptcy("canb", s) t1 = now() while True: query = "Pacific" report.query( name_last=query, filed_from=datetime.date(2019, 1, 28), filed_to=datetime.date(2019, 1, 30), ) num_results = len(report.data) print("Checked '%s' and got %s results" % (query, num_results)) if num_results > 0: print("Sending emails and exiting!") send_emails(report, recipients) exit(0) query = "PG&E" report.query( name_last=query, filed_from=datetime.date(2019, 1, 28), filed_to=datetime.date(2019, 1, 30), ) num_results = len(report.data) print("Checked '%s' and got %s results" % (query, num_results)) if num_results > 0: print("Sending emails and exiting!") send_emails(report, recipients) exit(0) time.sleep(options["sleep"]) t2 = now() min_login_frequency = 60 * 30 # thirty minutes if (t2 - t1).seconds > min_login_frequency: print("Logging in again.") s.login() t1 = now()
def handle(self, *args, **options): super(Command, self).handle(*args, **options) recipients = options['recipients'].split(',') print("Recipients list is: %s" % recipients) s = PacerSession(username=settings.PACER_USERNAME, password=settings.PACER_PASSWORD) s.login() report = CaseQueryAdvancedBankruptcy('canb', s) t1 = now() while True: query = 'Pacific' report.query( name_last=query, filed_from=datetime.date(2019, 1, 28), filed_to=datetime.date(2019, 1, 30), ) num_results = len(report.data) print("Checked '%s' and got %s results" % (query, num_results)) if num_results > 0: print("Sending emails and exiting!") send_emails(report, recipients) exit(0) query = 'PG&E' report.query( name_last=query, filed_from=datetime.date(2019, 1, 28), filed_to=datetime.date(2019, 1, 30), ) num_results = len(report.data) print("Checked '%s' and got %s results" % (query, num_results)) if num_results > 0: print("Sending emails and exiting!") send_emails(report, recipients) exit(0) time.sleep(options['sleep']) t2 = now() min_login_frequency = 60 * 30 # thirty minutes if (t2 - t1).seconds > min_login_frequency: print("Logging in again.") s.login() t1 = now()
def get_petitions(options): """Just get document number one for every docket that's tagged in this collection. """ rds = ( RECAPDocument.objects.filter( tags__name=TAG, document_number="1", document_type=RECAPDocument.PACER_DOCUMENT, ) .exclude(pacer_doc_id="",) .order_by("pk") .values_list("pk", flat=True) .iterator() ) q = options["queue"] throttle = CeleryThrottle(queue_name=q) pacer_session = PacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) pacer_session.login() for i, rd_pk in enumerate(rds): if i < options["offset"]: i += 1 continue if i >= options["limit"] > 0: break if i % 1000 == 0: pacer_session = PacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) pacer_session.login() logger.info("Sent %s tasks to celery so far." % i) logger.info("Doing row %s", i) throttle.maybe_wait() chain( get_pacer_doc_by_rd.s( rd_pk, pacer_session.cookies, tag=TAG_PETITIONS ).set(queue=q), extract_recap_pdf.si(rd_pk).set(queue=q), add_items_to_solr.si([rd_pk], "search.RECAPDocument").set(queue=q), ).apply_async()
def get_dockets(options): """Download the dockets described in the CSV """ f = options['file'] reader = csv.DictReader(f) q = options['queue'] throttle = CeleryThrottle(queue_name=q) pacer_session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) pacer_session.login() for i, row in enumerate(reader): if i < options['offset']: continue if i >= options['limit'] > 0: break if i % 1000 == 0: pacer_session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) pacer_session.login() logger.info("Sent %s tasks to celery so far." % i) logger.info("Doing row %s", i) throttle.maybe_wait() chain( get_pacer_case_id_and_title.s( pass_through=None, docket_number=make_docket_number(row['filecy'], row['docket']), court_id='ilnb', cookies=pacer_session.cookies, office_number=row['office'], docket_number_letters='bk', ).set(queue=q), get_docket_by_pacer_case_id.s( court_id='ilnb', cookies=pacer_session.cookies, tag_names=[TAG], **{ 'show_parties_and_counsel': True, 'show_terminated_parties': True, 'show_list_of_member_cases': True } ).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async()
def update_any_missing_pacer_case_ids(options):
    """The network requests were making things far too slow and had to be
    disabled during the first pass. With this method, we update any items
    that are missing their pacer case ID value.
    """
    ds = Docket.objects.filter(
        idb_data__isnull=False,
        pacer_case_id=None,
    )
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, d in enumerate(queryset_generator(ds)):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        if i % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()
        throttle.maybe_wait()
        logger.info("Getting pacer_case_id for item %s", d)
        params = make_fjc_idb_lookup_params(d.idb_data)
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=d.pk,
                docket_number=d.idb_data.docket_number,
                court_id=d.idb_data.district_id,
                cookies=session.cookies,
                **params
            ).set(queue=q),
            update_docket_from_hidden_api.s().set(queue=q),
        ).apply_async()
def get_petitions(options): """Just get document number one for every docket that's tagged in this collection. """ rds = RECAPDocument.objects.filter( tags__name=TAG, document_number='1', document_type=RECAPDocument.PACER_DOCUMENT, ).exclude( pacer_doc_id='', ).order_by('pk').values_list('pk', flat=True).iterator() q = options['queue'] throttle = CeleryThrottle(queue_name=q) pacer_session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) pacer_session.login() for i, rd_pk in enumerate(rds): if i < options['offset']: i += 1 continue if i >= options['limit'] > 0: break if i % 1000 == 0: pacer_session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) pacer_session.login() logger.info("Sent %s tasks to celery so far." % i) logger.info("Doing row %s", i) throttle.maybe_wait() chain( get_pacer_doc_by_rd.s( rd_pk, pacer_session.cookies, tag=TAG_PETITIONS).set(queue=q), extract_recap_pdf.si(rd_pk).set(queue=q), add_items_to_solr.si([rd_pk], 'search.RECAPDocument').set(queue=q), ).apply_async()
def download_dockets(options): """Download dockets listed in the spreadsheet.""" f = open(options["input_file"], "r") dialect = csv.Sniffer().sniff(f.read(2048)) f.seek(0) reader = csv.DictReader(f, dialect=dialect) q = options["queue"] throttle = CeleryThrottle(queue_name=q) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() for i, row in enumerate(reader): if i < options["offset"]: continue if i >= options["limit"] > 0: break throttle.maybe_wait() logger.info("Doing row %s: %s", i, row) row_tag = f"{PROJECT_TAG_NAME}-{row['id']}" if not row["district_ct"]: chain( get_appellate_docket_by_docket_number.s( docket_number=row["docket_no1"], court_id=row["cl_court"], cookies=session.cookies, tag_names=[PROJECT_TAG_NAME, row_tag], # Do not get the docket entries for now. We're only # interested in the date terminated. If it's an open case, # we'll handle that later. **{ "show_docket_entries": False, "show_orig_docket": False, "show_prior_cases": False, "show_associated_cases": False, "show_panel_info": True, "show_party_atty_info": True, "show_caption": True, }, ).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async() else: chain( get_pacer_case_id_and_title.s( pass_through=None, docket_number=row["docket_no1"], court_id=row["cl_court"], cookies=session.cookies, case_name=row["name"], ).set(queue=q), do_case_query_by_pacer_case_id.s( court_id=row["cl_court"], cookies=session.cookies, tag_names=[PROJECT_TAG_NAME, row_tag], ).set(queue=q), get_docket_by_pacer_case_id.s( court_id=row["cl_court"], cookies=session.cookies, tag_names=[PROJECT_TAG_NAME, row_tag], **{ # No docket entries "doc_num_start": 10000, "doc_num_end": 10000, "show_parties_and_counsel": True, "show_terminated_parties": True, "show_list_of_member_cases": True, }, ).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async() f.close()
def get_dockets(options): """Download the dockets described in the CSV according to the `tasks` option. """ f = options['file'] reader = csv.DictReader(f) q = options['queue'] task = options['task'] throttle = CeleryThrottle(queue_name=q) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() for i, row in enumerate(reader): if i < options['offset']: continue if i >= options['limit'] > 0: break if row['Too Old'] == 'Yes': continue if row['Appellate/District'].lower() != task: # Only do appellate when appellate, and district when district. continue # All tests pass. Get the docket. logger.info("Doing row %s: %s", i, row) throttle.maybe_wait() if task == 'appellate': chain( get_appellate_docket_by_docket_number.s( docket_number=row['Cleaned case_No'], court_id=row['fjc_court_id'], cookies=session.cookies, tag_names=[TAG], **{ 'show_docket_entries': True, 'show_orig_docket': True, 'show_prior_cases': True, 'show_associated_cases': True, 'show_panel_info': True, 'show_party_atty_info': True, 'show_caption': True, } ).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async() elif task == 'district': chain( get_pacer_case_id_and_title.s( pass_through=None, docket_number=row['Cleaned case_No'], court_id=row['fjc_court_id'], cookies=session.cookies, case_name=row['Title'], ).set(queue=q), get_docket_by_pacer_case_id.s( court_id=row['fjc_court_id'], cookies=session.cookies, tag_names=[TAG], **{ 'show_parties_and_counsel': True, 'show_terminated_parties': True, 'show_list_of_member_cases': True } ).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async()
def get_dockets(options): """Download the dockets described in the CSV according to the `tasks` option. """ f = options['file'] reader = csv.DictReader(f) q = options['queue'] task = options['task'] throttle = CeleryThrottle(queue_name=q) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() for i, row in enumerate(reader): if i < options['offset']: continue if i >= options['limit'] > 0: break if row['Too Old'] == 'Yes': continue if row['Appellate/District'].lower() != task: # Only do appellate when appellate, and district when district. continue # All tests pass. Get the docket. logger.info("Doing row %s: %s", i, row) throttle.maybe_wait() if task == 'appellate': chain( get_appellate_docket_by_docket_number.s( docket_number=row['Cleaned case_No'], court_id=row['fjc_court_id'], cookies=session.cookies, tag_names=[TAG], **{ 'show_docket_entries': True, 'show_orig_docket': True, 'show_prior_cases': True, 'show_associated_cases': True, 'show_panel_info': True, 'show_party_atty_info': True, 'show_caption': True, }).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async() elif task == 'district': chain( get_pacer_case_id_and_title.s( pass_through=None, docket_number=row['Cleaned case_No'], court_id=row['fjc_court_id'], cookies=session.cookies, case_name=row['Title'], ).set(queue=q), get_docket_by_pacer_case_id.s(court_id=row['fjc_court_id'], cookies=session.cookies, tag_names=[TAG], **{ 'show_parties_and_counsel': True, 'show_terminated_parties': True, 'show_list_of_member_cases': True }).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async()
def download_documents(options): """We've got good values in the new columns, so just need to look those up, and get the documents from PACER. """ f = open(options['input_file'], 'r') dialect = csv.Sniffer().sniff(f.read(1024)) f.seek(0) reader = csv.DictReader(f, dialect=dialect) q = options['queue'] throttle = CeleryThrottle(queue_name=q, min_items=options['queue_length']) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() for i, row in enumerate(reader): if i < options['offset']: continue if i >= options['limit'] > 0: break throttle.maybe_wait() logger.info("Doing row %s: %s", i, row) docket_number = row['cl_d_docket_number'] or \ row['cl_d_docket_number (student)'] or \ None if not docket_number: logger.warn("No docket number found for row: %s", i) continue court = Court.objects.get(fjc_court_id=row['AO ID'].rjust(2, '0'), jurisdiction=Court.FEDERAL_DISTRICT) try: d = Docket.objects.get(docket_number=docket_number, court=court) except Docket.MultipleObjectsReturned: logger.warn("Multiple objects returned for row: %s", i) continue except Docket.DoesNotExist: logger.warn("Could not find docket for row: %s", i) continue # Got the docket, now get the documents from it, tag & OCR them. document_date = datetime.strptime(row['Date'], '%m/%d/%Y').date() des = d.docket_entries.filter(date_filed=document_date) count = des.count() if count == 0: logger.warn("No docket entries found for row: %s", i) continue elif des.count() == 1: good_des = [des[0]] else: # More than one item. Apply filtering rules. good_des = filter_des(des) # We've got our des, now download them. for de in good_des: rds = de.recap_documents.filter( document_type=RECAPDocument.PACER_DOCUMENT) for rd in rds: if not rd.pacer_doc_id: logger.warn("Unable to get pacer_doc_id for item with " "rd_pk: %s. Restricted document?", rd.pk) continue if options['task'] == 'add_extra_tags': # Wherein I belatedly realize we need a tag specifically # for this part of the project. add_tags(rd, TAG_NAME_OPINIONS) else: # Otherwise, do the normal download thing. chain( get_pacer_doc_by_rd.s( rd.pk, session.cookies, tag=TAG_NAME).set(queue=q), extract_recap_pdf.si(rd.pk).set(queue=q), add_items_to_solr.si( [rd.pk], 'search.RECAPDocument').set(queue=q), ).apply_async() f.close()
def download_dockets(options): """Download dockets listed in the spreadsheet.""" f = open(options['input_file'], 'r') dialect = csv.Sniffer().sniff(f.read(2048)) f.seek(0) reader = csv.DictReader(f, dialect=dialect) q = options['queue'] throttle = CeleryThrottle(queue_name=q, min_items=options['queue_length']) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() for i, row in enumerate(reader): if i < options['offset']: continue if i >= options['limit'] > 0: break throttle.maybe_wait() logger.info("Doing row %s: %s", i, row) row_tag = '%s-%s' % (PROJECT_TAG_NAME, row['id']) if not row['district_ct']: chain( get_appellate_docket_by_docket_number.s( docket_number=row['docket_no1'], court_id=row['cl_court'], cookies=session.cookies, tag_names=[PROJECT_TAG_NAME, row_tag], # Do not get the docket entries for now. We're only # interested in the date terminated. If it's an open case, # we'll handle that later. **{ 'show_docket_entries': False, 'show_orig_docket': False, 'show_prior_cases': False, 'show_associated_cases': False, 'show_panel_info': True, 'show_party_atty_info': True, 'show_caption': True, } ).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async() else: chain( get_pacer_case_id_and_title.s( pass_through=None, docket_number=row['docket_no1'], court_id=row['cl_court'], cookies=session.cookies, case_name=row['name'], ).set(queue=q), do_case_query_by_pacer_case_id.s( court_id=row['cl_court'], cookies=session.cookies, tag_names=[PROJECT_TAG_NAME, row_tag], ).set(queue=q), get_docket_by_pacer_case_id.s( court_id=row['cl_court'], cookies=session.cookies, tag_names=[PROJECT_TAG_NAME, row_tag], **{ # No docket entries 'doc_num_start': 10000, 'doc_num_end': 10000, 'show_parties_and_counsel': True, 'show_terminated_parties': True, 'show_list_of_member_cases': True, } ).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async() f.close()
def download_documents(options): """We've got good values in the new columns, so just need to look those up, and get the documents from PACER. """ f = open(options["input_file"], "r") dialect = csv.Sniffer().sniff(f.read(1024)) f.seek(0) reader = csv.DictReader(f, dialect=dialect) q = options["queue"] throttle = CeleryThrottle(queue_name=q, min_items=options["queue_length"]) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() for i, row in enumerate(reader): if i < options["offset"]: continue if i >= options["limit"] > 0: break throttle.maybe_wait() logger.info("Doing row %s: %s", i, row) docket_number = (row["cl_d_docket_number"] or row["cl_d_docket_number (student)"] or None) if not docket_number: logger.warn("No docket number found for row: %s", i) continue court = Court.objects.get( fjc_court_id=row["AO ID"].rjust(2, "0"), jurisdiction=Court.FEDERAL_DISTRICT, ) try: d = Docket.objects.get(docket_number=docket_number, court=court) except Docket.MultipleObjectsReturned: logger.warn("Multiple objects returned for row: %s", i) continue except Docket.DoesNotExist: logger.warn("Could not find docket for row: %s", i) continue # Got the docket, now get the documents from it, tag & OCR them. document_date = datetime.strptime(row["Date"], "%m/%d/%Y").date() des = d.docket_entries.filter(date_filed=document_date) count = des.count() if count == 0: logger.warn("No docket entries found for row: %s", i) continue elif des.count() == 1: good_des = [des[0]] else: # More than one item. Apply filtering rules. good_des = filter_des(des) # We've got our des, now download them. for de in good_des: rds = de.recap_documents.filter( document_type=RECAPDocument.PACER_DOCUMENT) for rd in rds: if not rd.pacer_doc_id: logger.warn( "Unable to get pacer_doc_id for item with " "rd_pk: %s. Restricted document?", rd.pk, ) continue if options["task"] == "add_extra_tags": # Wherein I belatedly realize we need a tag specifically # for this part of the project. add_tags(rd, TAG_NAME_OPINIONS) else: # Otherwise, do the normal download thing. chain( get_pacer_doc_by_rd.s(rd.pk, session.cookies, tag=TAG_NAME).set(queue=q), extract_recap_pdf.si(rd.pk).set(queue=q), add_items_to_solr.si( [rd.pk], "search.RECAPDocument").set(queue=q), ).apply_async() f.close()
def download_dockets(options): """Download dockets listed in the spreadsheet.""" f = open(options['input_file'], 'r') dialect = csv.Sniffer().sniff(f.read(2048)) f.seek(0) reader = csv.DictReader(f, dialect=dialect) q = options['queue'] throttle = CeleryThrottle(queue_name=q, min_items=options['queue_length']) session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) session.login() for i, row in enumerate(reader): if i < options['offset']: continue if i >= options['limit'] > 0: break throttle.maybe_wait() logger.info("Doing row %s: %s", i, row) row_tag = '%s-%s' % (PROJECT_TAG_NAME, row['id']) if not row['district_ct']: chain( get_appellate_docket_by_docket_number.s( docket_number=row['docket_no1'], court_id=row['cl_court'], cookies=session.cookies, tag_names=[PROJECT_TAG_NAME, row_tag], # Do not get the docket entries for now. We're only # interested in the date terminated. If it's an open case, # we'll handle that later. **{ 'show_docket_entries': False, 'show_orig_docket': False, 'show_prior_cases': False, 'show_associated_cases': False, 'show_panel_info': True, 'show_party_atty_info': True, 'show_caption': True, }).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async() else: chain( get_pacer_case_id_and_title.s( docket_number=row['docket_no1'], court_id=row['cl_court'], cookies=session.cookies, case_name=row['name'], ).set(queue=q), do_case_query_by_pacer_case_id.s( court_id=row['cl_court'], cookies=session.cookies, tag_names=[PROJECT_TAG_NAME, row_tag], ).set(queue=q), get_docket_by_pacer_case_id.s( court_id=row['cl_court'], cookies=session.cookies, tag_names=[PROJECT_TAG_NAME, row_tag], **{ # No docket entries 'doc_num_start': 10000, 'doc_num_end': 10000, 'show_parties_and_counsel': True, 'show_terminated_parties': True, 'show_list_of_member_cases': True, }).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async() f.close()
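# The handlers above all read the same argparse options: "queue", "offset",
# "limit", and in some commands "queue_length", "file", "input_file", "task",
# or "skip_until". A sketch of how those flags might be declared on the
# management command follows; the defaults and help text here are assumptions
# for illustration, not the project's actual definitions.
def add_arguments(self, parser):
    parser.add_argument(
        "--queue", default="batch1", help="The celery queue to use."
    )
    parser.add_argument(
        "--offset",
        type=int,
        default=0,
        help="Skip this many items before enqueueing anything.",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=0,
        help="Stop after this many items; 0 means no limit.",
    )
    parser.add_argument(
        "--queue-length",
        type=int,
        default=100,
        help="Roughly how many tasks to keep in the queue at once.",
    )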