def get_pacer_dockets(options, docket_pks, tags):
    """Get the pacer dockets identified by the FJC IDB rows"""
    queue = options["queue"]
    throttle = CeleryThrottle(queue_name=queue)
    session = None
    for idx, docket_pk in enumerate(docket_pks):
        if idx < options["offset"]:
            continue
        if idx >= options["limit"] > 0:
            break
        throttle.maybe_wait()
        if session is None or idx % 1000 == 0:
            # Periodically re-login so the PACER cookies stay fresh.
            session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            session.login()
            logger.info(f"Sent {idx} tasks to celery so far.")
        docket = Docket.objects.get(pk=docket_pk)
        chain(
            get_docket_by_pacer_case_id.s(
                {
                    "pacer_case_id": docket.pacer_case_id,
                    "docket_pk": docket.pk,
                },
                docket.court_id,
                cookies=session.cookies,
                tag_names=tags,
                show_parties_and_counsel=True,
                show_terminated_parties=True,
                show_list_of_member_cases=False,
            ).set(queue=queue),
            add_or_update_recap_docket.s().set(queue=queue),
        ).apply_async()
def get_pacer_dockets(options, docket_pks, tags):
    """Get the pacer dockets identified by the FJC IDB rows"""
    queue = options['queue']
    throttle = CeleryThrottle(queue_name=queue)
    session = None
    for row_num, docket_pk in enumerate(docket_pks):
        if row_num < options['offset']:
            continue
        if row_num >= options['limit'] > 0:
            break
        throttle.maybe_wait()
        if session is None or row_num % 1000 == 0:
            # Re-authenticate every thousand items to keep cookies valid.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()
            logger.info("Sent %s tasks to celery so far." % row_num)
        docket = Docket.objects.get(pk=docket_pk)
        chain(
            get_docket_by_pacer_case_id.s(
                {'pacer_case_id': docket.pacer_case_id,
                 'docket_pk': docket.pk},
                docket.court_id,
                cookies=session.cookies,
                tag_names=tags,
                show_parties_and_counsel=True,
                show_terminated_parties=True,
                show_list_of_member_cases=False,
            ).set(queue=queue),
            add_or_update_recap_docket.s().set(queue=queue),
        ).apply_async()
def get_pacer_dockets(options, docket_pks, tag):
    """Get the pacer dockets identified by the FJC IDB rows"""
    queue = options['queue']
    throttle = CeleryThrottle(queue_name=queue)
    session = None
    for n, docket_pk in enumerate(docket_pks):
        if n < options['offset']:
            continue
        if n >= options['limit'] > 0:
            break
        throttle.maybe_wait()
        if session is None or n % 1000 == 0:
            # Fresh login every 1,000 dockets in case cookies expire.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()
            logger.info("Sent %s tasks to celery so far." % n)
        docket = Docket.objects.get(pk=docket_pk)
        chain(
            get_docket_by_pacer_case_id.s(
                {'pacer_case_id': docket.pacer_case_id},
                docket.court_id,
                cookies=session.cookies,
                tag_names=[tag],
                show_parties_and_counsel=True,
                show_terminated_parties=True,
                show_list_of_member_cases=True,
            ).set(queue=queue),
            add_or_update_recap_docket.s().set(queue=queue),
        ).apply_async()
def get_docket_and_claims(docket_number, court, case_name, cookies, tags, q):
    """Purchase a bankruptcy docket and its claims registry, then index it.

    Chains the PACER case lookup, the docket report, the claims history
    report, and the final save to the DB and Solr.
    """
    chain(
        get_pacer_case_id_and_title.s(
            pass_through=None,
            docket_number=docket_number,
            court_id=court,
            cookies=cookies,
            case_name=case_name,
            docket_number_letters="bk",
        ).set(queue=q),
        get_docket_by_pacer_case_id.s(
            court_id=court,
            cookies=cookies,
            tag_names=tags,
            show_parties_and_counsel=True,
            show_terminated_parties=True,
            show_list_of_member_cases=False,
        ).set(queue=q),
        get_bankr_claims_registry.s(
            cookies=cookies,
            tag_names=tags,
        ).set(queue=q),
        add_or_update_recap_docket.s().set(queue=q),
    ).apply_async()
def get_dockets(options, items, tags, sample_size=0, doc_num_end=''):
    """Download dockets from PACER.

    :param options: Options provided by argparse
    :param items: Items from our FJC IDB database
    :param tags: A list of tag names to associate with the purchased content.
    :param sample_size: The number of items to get. If 0, get them all. Else,
    get only this many and do it randomly.
    :param doc_num_end: Only get docket numbers up to this value to constrain
    costs. If set to an empty string, no constraints are applied. Note that
    applying this value means no unnumbered entries will be retrieved by
    PACER.
    """
    if sample_size > 0:
        items = items.order_by('?')[:sample_size]
    queue = options['queue']
    throttle = CeleryThrottle(queue_name=queue)
    pacer_session = PacerSession(username=PACER_USERNAME,
                                 password=PACER_PASSWORD)
    pacer_session.login()
    for n, idb_row in enumerate(items):
        if n < options['offset']:
            continue
        if n >= options['limit'] > 0:
            break
        if n % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", n, idb_row)
        throttle.maybe_wait()
        lookup_params = make_fjc_idb_lookup_params(idb_row)
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=None,
                docket_number=idb_row.docket_number,
                court_id=idb_row.district_id,
                cookies=pacer_session.cookies,
                **lookup_params
            ).set(queue=queue),
            filter_docket_by_tags.s(
                tags, idb_row.district_id
            ).set(queue=queue),
            get_docket_by_pacer_case_id.s(
                court_id=idb_row.district_id,
                cookies=pacer_session.cookies,
                tag_names=tags,
                show_parties_and_counsel=True,
                show_terminated_parties=True,
                show_list_of_member_cases=False,
                doc_num_end=doc_num_end,
            ).set(queue=queue),
            add_or_update_recap_docket.s().set(queue=queue),
        ).apply_async()
def get_pacer_dockets(options, row_pks, tag=None):
    """Get the pacer dockets identified by the FJC IDB rows"""
    queue = options['queue']
    throttle = CeleryThrottle(queue_name=queue)
    for n, row_pk in enumerate(row_pks):
        if n >= options['count'] > 0:
            break
        throttle.maybe_wait()
        if n % 1000 == 0:
            # Log in anew every thousand rows (and on the first row).
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % n)
        idb_row = FjcIntegratedDatabase.objects.get(pk=row_pk)
        chain(
            get_docket_by_pacer_case_id.s(
                idb_row.pacer_case_id,
                idb_row.district_id,
                pacer_session,
                tag=tag,
                show_parties_and_counsel=True,
                show_terminated_parties=True,
                show_list_of_member_cases=True,
            ).set(queue=queue),
            add_or_update_recap_docket.s().set(queue=queue),
        ).apply_async()
def get_dockets(options, items, tags, sample_size=0, doc_num_end=""):
    """Download dockets from PACER.

    :param options: Options provided by argparse
    :param items: Items from our FJC IDB database
    :param tags: A list of tag names to associate with the purchased content.
    :param sample_size: The number of items to get. If 0, get them all. Else,
    get only this many and do it randomly.
    :param doc_num_end: Only get docket numbers up to this value to constrain
    costs. If set to an empty string, no constraints are applied. Note that
    applying this value means no unnumbered entries will be retrieved by
    PACER.
    """
    if sample_size > 0:
        items = items.order_by("?")[:sample_size]
    queue = options["queue"]
    throttle = CeleryThrottle(queue_name=queue)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    # These report options never change, so build them once.
    report_flags = {
        "show_parties_and_counsel": True,
        "show_terminated_parties": True,
        "show_list_of_member_cases": False,
        "doc_num_end": doc_num_end,
    }
    for count, row in enumerate(items):
        if count < options["offset"]:
            continue
        if count >= options["limit"] > 0:
            break
        if count % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            session.login()
        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", count, row)
        throttle.maybe_wait()
        lookup_params = make_fjc_idb_lookup_params(row)
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=None,
                docket_number=row.docket_number,
                court_id=row.district_id,
                cookies=session.cookies,
                **lookup_params,
            ).set(queue=queue),
            filter_docket_by_tags.s(tags, row.district_id).set(queue=queue),
            get_docket_by_pacer_case_id.s(
                court_id=row.district_id,
                cookies=session.cookies,
                tag_names=tags,
                **report_flags,
            ).set(queue=queue),
            add_or_update_recap_docket.s().set(queue=queue),
        ).apply_async()
def get_dockets(options, items, tags, sample_size=0):
    """Download dockets from PACER.

    :param options: Options provided by argparse
    :param items: Items from our FJC IDB database
    :param tags: A list of tag names to associate with the purchased content.
    :param sample_size: The number of items to get. If 0, get them all. Else,
    get only this many and do it randomly.
    """
    if sample_size > 0:
        items = items.order_by('?')[:sample_size]
    queue = options['queue']
    throttle = CeleryThrottle(queue_name=queue)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for n, row in enumerate(items):
        if n < options['offset']:
            continue
        if n >= options['limit'] > 0:
            break
        if n % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()
        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", n, row)
        throttle.maybe_wait()
        lookup_params = make_fjc_idb_lookup_params(row)
        chain(
            get_pacer_case_id_and_title.s(
                docket_number=row.docket_number,
                court_id=row.district_id,
                cookies=session.cookies,
                **lookup_params
            ).set(queue=queue),
            filter_docket_by_tags.s(tags, row.district_id).set(queue=queue),
            get_docket_by_pacer_case_id.s(
                court_id=row.district_id,
                cookies=session.cookies,
                tag_names=tags,
                show_parties_and_counsel=True,
                show_terminated_parties=True,
                show_list_of_member_cases=True,
            ).set(queue=queue),
            add_or_update_recap_docket.s().set(queue=queue),
        ).apply_async()
def download_dockets(options):
    """Download dockets listed in the spreadsheet."""
    with open(options["input_file"], "r") as f:
        # Sniff the delimiter/quoting from a sample, then rewind.
        dialect = csv.Sniffer().sniff(f.read(1024))
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect)
        queue = options["queue"]
        task = options["task"]
        throttle = CeleryThrottle(
            queue_name=queue, min_items=options["queue_length"]
        )
        session = PacerSession(
            username=PACER_USERNAME, password=PACER_PASSWORD
        )
        session.login()
        for line_no, row in enumerate(reader):
            if line_no < options["offset"]:
                continue
            if line_no >= options["limit"] > 0:
                break
            throttle.maybe_wait()
            logger.info("Doing row %s: %s", line_no, row)
            if row["idb_docket_number"]:
                if task == "download_student_dockets":
                    continue
                # Zero-pad the docket number up to seven digits because
                # Excel ate the leading zeros that these would normally have.
                docket_number = row["idb_docket_number"].rjust(7, "0")
            elif row["student_docket_number"]:
                # Use the values collected by student
                # researchers, then cleaned up my mlr.
                docket_number = row["student_docket_number"]
            else:
                # No docket number; move on.
                continue
            court = Court.objects.get(
                fjc_court_id=row["AO ID"].rjust(2, "0"),
                jurisdiction=Court.FEDERAL_DISTRICT,
            )
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=docket_number,
                    court_id=court.pk,
                    cookies=session.cookies,
                    case_name=row["Case Name"],
                ).set(queue=queue),
                get_docket_by_pacer_case_id.s(
                    court_id=court.pk,
                    cookies=session.cookies,
                    tag_names=[TAG_NAME],
                ).set(queue=queue),
                add_or_update_recap_docket.s().set(queue=queue),
            ).apply_async()
def download_dockets(options):
    """Download dockets listed in the spreadsheet."""
    with open(options['input_file'], 'r') as f:
        # Detect the CSV dialect from the head of the file, then rewind.
        dialect = csv.Sniffer().sniff(f.read(1024))
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect)
        queue = options['queue']
        task = options['task']
        throttle = CeleryThrottle(queue_name=queue,
                                  min_items=options['queue_length'])
        session = PacerSession(username=PACER_USERNAME,
                               password=PACER_PASSWORD)
        session.login()
        for n, row in enumerate(reader):
            if n < options['offset']:
                continue
            if n >= options['limit'] > 0:
                break
            throttle.maybe_wait()
            logger.info("Doing row %s: %s", n, row)
            if row['idb_docket_number']:
                if task == 'download_student_dockets':
                    continue
                # Zero-pad the docket number up to seven digits because
                # Excel ate the leading zeros that these would normally have.
                docket_number = row['idb_docket_number'].rjust(7, '0')
            elif row['student_docket_number']:
                # Use the values collected by student
                # researchers, then cleaned up my mlr.
                docket_number = row['student_docket_number']
            else:
                # No docket number; move on.
                continue
            court = Court.objects.get(
                fjc_court_id=row['AO ID'].rjust(2, '0'),
                jurisdiction=Court.FEDERAL_DISTRICT,
            )
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=docket_number,
                    court_id=court.pk,
                    cookies=session.cookies,
                    case_name=row['Case Name'],
                ).set(queue=queue),
                get_docket_by_pacer_case_id.s(
                    court_id=court.pk,
                    cookies=session.cookies,
                    tag_names=[TAG_NAME],
                ).set(queue=queue),
                add_or_update_recap_docket.s().set(queue=queue),
            ).apply_async()
def get_dockets(options):
    """Download a sample of dockets from PACER matching the 7xx series of
    NOS codes.
    """
    nos_codes = [
        LABOR_LITIGATION_OTHER,
        LABOR_MANAGEMENT_RELATIONS_ACT,
        LABOR_MANAGEMENT_REPORT_DISCLOSURE,
        FAIR_LABOR_STANDARDS_ACT_CV,
        RAILWAY_LABOR_ACT,
        FAMILY_AND_MEDICAL_LEAVE_ACT,
        EMPLOYEE_RETIREMENT_INCOME_SECURITY_ACT,
    ]
    sample_size = 300
    # Random sample of terminated labor cases in the window of interest.
    items = FjcIntegratedDatabase.objects.filter(
        nature_of_suit__in=nos_codes,
        date_terminated__gt='2009-01-01',
        date_terminated__lt='2018-10-15',
        date_filed__gt='2009-01-01',
    ).order_by('?')[:sample_size]
    queue = options['queue']
    throttle = CeleryThrottle(queue_name=queue)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for n, row in enumerate(items):
        if n < options['offset']:
            continue
        if n >= options['limit'] > 0:
            break
        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", n, row)
        logger.info("This case is from year: %s", row.date_filed.year)
        throttle.maybe_wait()
        case_name = '%s v. %s' % (row.plaintiff, row.defendant)
        chain(
            get_pacer_case_id_and_title.s(
                docket_number=row.docket_number,
                court_id=row.district_id,
                cookies=session.cookies,
                case_name=case_name,
            ).set(queue=queue),
            get_docket_by_pacer_case_id.s(
                court_id=row.district_id,
                cookies=session.cookies,
                tag_names=[TAG],
                show_parties_and_counsel=True,
                show_terminated_parties=True,
                show_list_of_member_cases=True,
            ).set(queue=queue),
            add_or_update_recap_docket.s().set(queue=queue),
        ).apply_async()
def get_dockets(options):
    """Get the dockets by the particular judge now that we have run iquery
    for all of the cases in the jurisdiction, and now that we have
    """
    queue = options["queue"]
    throttle = CeleryThrottle(queue_name=queue)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    buchwald_id = 450
    dockets = (
        Docket.objects.filter(
            court_id="nysd", assigned_to_id=buchwald_id, tags__name=NYSD_TAG
        )
        .exclude(idb_data__nature_of_suit__in=NOS_EXCLUSIONS)
        .exclude(idb_data__isnull=True)
    )
    logger.info("Got %s dockets to download", dockets.count())
    for n, docket in enumerate(dockets):
        if n < options["skip_until"]:
            continue
        if n >= options["limit"] > 0:
            break
        if n % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism isn't
            # working.
            session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            session.login()
        throttle.maybe_wait()
        logger.info("%s: Doing docket with pk: %s", n, docket.pk)
        chain(
            get_docket_by_pacer_case_id.s(
                data={"pacer_case_id": docket.pacer_case_id},
                court_id=docket.court_id,
                cookies=session.cookies,
                docket_pk=docket.pk,
                tag_names=[BUCKWALD_TAG],
                show_parties_and_counsel=True,
                show_terminated_parties=True,
                show_list_of_member_cases=False,
            ).set(queue=queue),
            add_or_update_recap_docket.s().set(queue=queue),
        ).apply_async()
def get_dockets(options):
    """Download the dockets described in the CSV
    """
    csv_file = options["file"]
    reader = csv.DictReader(csv_file)
    queue = options["queue"]
    throttle = CeleryThrottle(queue_name=queue)
    session = PacerSession(
        username=PACER_USERNAME, password=PACER_PASSWORD
    )
    session.login()
    for n, row in enumerate(reader):
        if n < options["offset"]:
            continue
        if n >= options["limit"] > 0:
            break
        if n % 1000 == 0:
            # Refresh the login periodically.
            session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            session.login()
            logger.info("Sent %s tasks to celery so far." % n)
        logger.info("Doing row %s", n)
        throttle.maybe_wait()
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=None,
                docket_number=make_docket_number(
                    row["filecy"], row["docket"]
                ),
                court_id="ilnb",
                cookies=session.cookies,
                office_number=row["office"],
                docket_number_letters="bk",
            ).set(queue=queue),
            get_docket_by_pacer_case_id.s(
                court_id="ilnb",
                cookies=session.cookies,
                tag_names=[TAG],
                show_parties_and_counsel=True,
                show_terminated_parties=True,
                show_list_of_member_cases=True,
            ).set(queue=queue),
            add_or_update_recap_docket.s().set(queue=queue),
        ).apply_async()
def get_dockets(options):
    """Download the dockets described in the CSV
    """
    csv_file = options['file']
    reader = csv.DictReader(csv_file)
    queue = options['queue']
    throttle = CeleryThrottle(queue_name=queue)
    session = PacerSession(username=PACER_USERNAME,
                           password=PACER_PASSWORD)
    session.login()
    for n, row in enumerate(reader):
        if n < options['offset']:
            continue
        if n >= options['limit'] > 0:
            break
        if n % 1000 == 0:
            # Log in again every thousand rows.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()
            logger.info("Sent %s tasks to celery so far." % n)
        logger.info("Doing row %s", n)
        throttle.maybe_wait()
        chain(
            get_pacer_case_id_and_title.s(
                pass_through=None,
                docket_number=make_docket_number(row['filecy'],
                                                 row['docket']),
                court_id='ilnb',
                cookies=session.cookies,
                office_number=row['office'],
                docket_number_letters='bk',
            ).set(queue=queue),
            get_docket_by_pacer_case_id.s(
                court_id='ilnb',
                cookies=session.cookies,
                tag_names=[TAG],
                show_parties_and_counsel=True,
                show_terminated_parties=True,
                show_list_of_member_cases=True,
            ).set(queue=queue),
            add_or_update_recap_docket.s().set(queue=queue),
        ).apply_async()
def download_dockets(options):
    """Download dockets listed in the spreadsheet.

    :param options: argparse options providing the input file, queue name,
    queue length, offset, and limit.
    :return: None
    """
    # Fix: use a context manager instead of open()/f.close(). The original
    # only closed the file on the happy path, leaking the handle if any row
    # raised before the end of the loop.
    with open(options['input_file'], 'r') as f:
        dialect = csv.Sniffer().sniff(f.read(2048))
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect)
        q = options['queue']
        throttle = CeleryThrottle(queue_name=q,
                                  min_items=options['queue_length'])
        session = PacerSession(username=PACER_USERNAME,
                               password=PACER_PASSWORD)
        session.login()
        for i, row in enumerate(reader):
            if i < options['offset']:
                continue
            if i >= options['limit'] > 0:
                break
            throttle.maybe_wait()
            logger.info("Doing row %s: %s", i, row)
            # Tag each row individually so items can be traced back to it.
            row_tag = '%s-%s' % (PROJECT_TAG_NAME, row['id'])
            if not row['district_ct']:
                chain(
                    get_appellate_docket_by_docket_number.s(
                        docket_number=row['docket_no1'],
                        court_id=row['cl_court'],
                        cookies=session.cookies,
                        tag_names=[PROJECT_TAG_NAME, row_tag],
                        # Do not get the docket entries for now. We're only
                        # interested in the date terminated. If it's an open
                        # case, we'll handle that later.
                        **{
                            'show_docket_entries': False,
                            'show_orig_docket': False,
                            'show_prior_cases': False,
                            'show_associated_cases': False,
                            'show_panel_info': True,
                            'show_party_atty_info': True,
                            'show_caption': True,
                        }
                    ).set(queue=q),
                    add_or_update_recap_docket.s().set(queue=q),
                ).apply_async()
            else:
                chain(
                    get_pacer_case_id_and_title.s(
                        docket_number=row['docket_no1'],
                        court_id=row['cl_court'],
                        cookies=session.cookies,
                        case_name=row['name'],
                    ).set(queue=q),
                    do_case_query_by_pacer_case_id.s(
                        court_id=row['cl_court'],
                        cookies=session.cookies,
                        tag_names=[PROJECT_TAG_NAME, row_tag],
                    ).set(queue=q),
                    get_docket_by_pacer_case_id.s(
                        court_id=row['cl_court'],
                        cookies=session.cookies,
                        tag_names=[PROJECT_TAG_NAME, row_tag],
                        **{
                            # No docket entries
                            'doc_num_start': 10000,
                            'doc_num_end': 10000,
                            'show_parties_and_counsel': True,
                            'show_terminated_parties': True,
                            'show_list_of_member_cases': True,
                        }
                    ).set(queue=q),
                    add_or_update_recap_docket.s().set(queue=q),
                ).apply_async()
def download_dockets(options):
    """Download dockets listed in the spreadsheet.

    :param options: argparse options providing the input file, queue name,
    offset, and limit.
    :return: None
    """
    # Fix: use a context manager instead of open()/f.close(). The original
    # only closed the file on the happy path, leaking the handle if any row
    # raised before the end of the loop.
    with open(options["input_file"], "r") as f:
        dialect = csv.Sniffer().sniff(f.read(2048))
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect)
        q = options["queue"]
        throttle = CeleryThrottle(queue_name=q)
        session = PacerSession(
            username=PACER_USERNAME, password=PACER_PASSWORD
        )
        session.login()
        for i, row in enumerate(reader):
            if i < options["offset"]:
                continue
            if i >= options["limit"] > 0:
                break
            throttle.maybe_wait()
            logger.info("Doing row %s: %s", i, row)
            # Tag each row individually so items can be traced back to it.
            row_tag = f"{PROJECT_TAG_NAME}-{row['id']}"
            if not row["district_ct"]:
                chain(
                    get_appellate_docket_by_docket_number.s(
                        docket_number=row["docket_no1"],
                        court_id=row["cl_court"],
                        cookies=session.cookies,
                        tag_names=[PROJECT_TAG_NAME, row_tag],
                        # Do not get the docket entries for now. We're only
                        # interested in the date terminated. If it's an open
                        # case, we'll handle that later.
                        **{
                            "show_docket_entries": False,
                            "show_orig_docket": False,
                            "show_prior_cases": False,
                            "show_associated_cases": False,
                            "show_panel_info": True,
                            "show_party_atty_info": True,
                            "show_caption": True,
                        },
                    ).set(queue=q),
                    add_or_update_recap_docket.s().set(queue=q),
                ).apply_async()
            else:
                chain(
                    get_pacer_case_id_and_title.s(
                        pass_through=None,
                        docket_number=row["docket_no1"],
                        court_id=row["cl_court"],
                        cookies=session.cookies,
                        case_name=row["name"],
                    ).set(queue=q),
                    do_case_query_by_pacer_case_id.s(
                        court_id=row["cl_court"],
                        cookies=session.cookies,
                        tag_names=[PROJECT_TAG_NAME, row_tag],
                    ).set(queue=q),
                    get_docket_by_pacer_case_id.s(
                        court_id=row["cl_court"],
                        cookies=session.cookies,
                        tag_names=[PROJECT_TAG_NAME, row_tag],
                        **{
                            # No docket entries
                            "doc_num_start": 10000,
                            "doc_num_end": 10000,
                            "show_parties_and_counsel": True,
                            "show_terminated_parties": True,
                            "show_list_of_member_cases": True,
                        },
                    ).set(queue=q),
                    add_or_update_recap_docket.s().set(queue=q),
                ).apply_async()
def do_pacer_fetch(fq):
    """Process a request made by a user to get an item from PACER.

    :param fq: The PacerFetchQueue item to process
    :return: None
    """
    c = None
    if fq.request_type == REQUEST_TYPE.DOCKET:
        # Request by docket_id
        court_id = fq.court_id or getattr(fq.docket, "court_id", None)
        kwargs = {
            # Universal params
            "court_id": court_id,
            "user_pk": fq.user_id,
            "docket_pk": fq.docket_id,
            # Scraping params
            "doc_num_start": fq.de_number_start,
            "doc_num_end": fq.de_number_end,
            "date_start": fq.de_date_start,
            "date_end": fq.de_date_end,
            "show_parties_and_counsel": fq.show_parties_and_counsel,
            "show_terminated_parties": fq.show_terminated_parties,
            "show_list_of_member_cases": fq.show_list_of_member_cases,
        }
        if (fq.docket_id and not fq.docket.pacer_case_id) or fq.docket_number:
            # We lack the pacer_case_id either on the docket or from the
            # submission. Look it up.
            docket_number = fq.docket_number or getattr(
                fq.docket, "docket_number", None
            )
            c = chain(
                get_pacer_case_id_and_title.si(
                    pass_through=None,
                    docket_number=docket_number,
                    court_id=court_id,
                    user_pk=fq.user_id,
                ),
                get_docket_by_pacer_case_id.s(**kwargs),
            )
        else:
            if fq.docket_id is not None and fq.docket.pacer_case_id:
                # We have the docket and its pacer_case_id
                kwargs["data"] = {"pacer_case_id": fq.docket.pacer_case_id}
                kwargs["court_id"] = fq.docket.court_id
            elif fq.pacer_case_id:
                # We lack the docket, but have a pacer_case_id
                kwargs["data"] = {"pacer_case_id": fq.pacer_case_id}
            c = chain(get_docket_by_pacer_case_id.si(**kwargs))
        c |= add_or_update_recap_docket.s()
    elif fq.request_type == REQUEST_TYPE.PDF:
        # Request by recap_document_id
        rd_pk = fq.recap_document_id
        if fq.recap_document_id:
            c = chain(
                fetch_pacer_doc_by_rd.si(rd_pk, fq.pk, fq.user_id),
                extract_recap_pdf.si(rd_pk),
                add_items_to_solr.si([rd_pk], "search.RECAPDocument"),
            )
    if c is not None:
        c |= mark_fq_successful.si(fq.pk)
        c.apply_async()
    else:
        # Somehow failed to make a chain. Mark the item as invalid.
        fq.status = PROCESSING_STATUS.INVALID_CONTENT
        fq.message = "Invalid submission, unable to make chain for processing."
        fq.save()
def get_dockets(options):
    """Download the dockets described in the CSV according to the `tasks`
    option.
    """
    csv_file = options['file']
    reader = csv.DictReader(csv_file)
    queue = options['queue']
    task = options['task']
    throttle = CeleryThrottle(queue_name=queue)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for n, row in enumerate(reader):
        if n < options['offset']:
            continue
        if n >= options['limit'] > 0:
            break
        if row['Too Old'] == 'Yes':
            continue
        if row['Appellate/District'].lower() != task:
            # Only do appellate when appellate, and district when district.
            continue
        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", n, row)
        throttle.maybe_wait()
        if task == 'appellate':
            chain(
                get_appellate_docket_by_docket_number.s(
                    docket_number=row['Cleaned case_No'],
                    court_id=row['fjc_court_id'],
                    cookies=session.cookies,
                    tag_names=[TAG],
                    show_docket_entries=True,
                    show_orig_docket=True,
                    show_prior_cases=True,
                    show_associated_cases=True,
                    show_panel_info=True,
                    show_party_atty_info=True,
                    show_caption=True,
                ).set(queue=queue),
                add_or_update_recap_docket.s().set(queue=queue),
            ).apply_async()
        elif task == 'district':
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=row['Cleaned case_No'],
                    court_id=row['fjc_court_id'],
                    cookies=session.cookies,
                    case_name=row['Title'],
                ).set(queue=queue),
                get_docket_by_pacer_case_id.s(
                    court_id=row['fjc_court_id'],
                    cookies=session.cookies,
                    tag_names=[TAG],
                    show_parties_and_counsel=True,
                    show_terminated_parties=True,
                    show_list_of_member_cases=True,
                ).set(queue=queue),
                add_or_update_recap_docket.s().set(queue=queue),
            ).apply_async()
def do_pacer_fetch(fq):
    """Process a request made by a user to get an item from PACER.

    :param fq: The PacerFetchQueue item to process
    :return: None
    """
    result = None
    if fq.request_type == REQUEST_TYPE.DOCKET:
        # Request by docket_id
        court_id = fq.court_id or getattr(fq.docket, "court_id", None)
        kwargs = {
            # Universal params
            "court_id": court_id,
            "user_pk": fq.user_id,
            "docket_pk": fq.docket_id,
            # Scraping params
            "doc_num_start": fq.de_number_start,
            "doc_num_end": fq.de_number_end,
            "date_start": fq.de_date_start,
            "date_end": fq.de_date_end,
            "show_parties_and_counsel": fq.show_parties_and_counsel,
            "show_terminated_parties": fq.show_terminated_parties,
            "show_list_of_member_cases": fq.show_list_of_member_cases,
        }
        if (fq.docket_id and not fq.docket.pacer_case_id) or fq.docket_number:
            # We lack the pacer_case_id either on the docket or from the
            # submission. Look it up.
            docket_number = fq.docket_number or getattr(
                fq.docket, "docket_number", None
            )
            c = chain(
                get_pacer_case_id_and_title.si(
                    pass_through=None,
                    docket_number=docket_number,
                    court_id=court_id,
                    user_pk=fq.user_id,
                ),
                get_docket_by_pacer_case_id.s(**kwargs),
            )
        else:
            if fq.docket_id is not None and fq.docket.pacer_case_id:
                # We have the docket and its pacer_case_id
                kwargs["data"] = {"pacer_case_id": fq.docket.pacer_case_id}
                kwargs["court_id"] = fq.docket.court_id
            elif fq.pacer_case_id:
                # We lack the docket, but have a pacer_case_id
                kwargs["data"] = {"pacer_case_id": fq.pacer_case_id}
            c = chain(get_docket_by_pacer_case_id.si(**kwargs))
        c |= add_or_update_recap_docket.s()
        c |= mark_fq_successful.si(fq.pk)
        result = c.apply_async()
    elif fq.request_type == REQUEST_TYPE.PDF:
        # Request by recap_document_id
        rd_pk = fq.recap_document_id
        result = chain(
            fetch_pacer_doc_by_rd.si(rd_pk, fq.pk),
            extract_recap_pdf.si(rd_pk),
            add_items_to_solr.si([rd_pk], "search.RECAPDocument"),
            mark_fq_successful.si(fq.pk),
        ).apply_async()
    elif fq.request_type == REQUEST_TYPE.ATTACHMENT_PAGE:
        result = fetch_attachment_page.apply_async(args=(fq.pk,))
    return result
def get_dockets(options):
    """Download the dockets described in the CSV according to the `tasks`
    option.
    """
    reader = csv.DictReader(options['file'])
    queue = options['queue']
    task = options['task']
    throttle = CeleryThrottle(queue_name=queue)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for row_num, row in enumerate(reader):
        if row_num < options['offset']:
            continue
        if row_num >= options['limit'] > 0:
            break
        if row['Too Old'] == 'Yes':
            continue
        if row['Appellate/District'].lower() != task:
            # Only do appellate when appellate, and district when district.
            continue
        # All tests pass. Get the docket.
        logger.info("Doing row %s: %s", row_num, row)
        throttle.maybe_wait()
        if task == 'appellate':
            chain(
                get_appellate_docket_by_docket_number.s(
                    docket_number=row['Cleaned case_No'],
                    court_id=row['fjc_court_id'],
                    cookies=session.cookies,
                    tag_names=[TAG],
                    show_docket_entries=True,
                    show_orig_docket=True,
                    show_prior_cases=True,
                    show_associated_cases=True,
                    show_panel_info=True,
                    show_party_atty_info=True,
                    show_caption=True,
                ).set(queue=queue),
                add_or_update_recap_docket.s().set(queue=queue),
            ).apply_async()
        elif task == 'district':
            chain(
                get_pacer_case_id_and_title.s(
                    pass_through=None,
                    docket_number=row['Cleaned case_No'],
                    court_id=row['fjc_court_id'],
                    cookies=session.cookies,
                    case_name=row['Title'],
                ).set(queue=queue),
                get_docket_by_pacer_case_id.s(
                    court_id=row['fjc_court_id'],
                    cookies=session.cookies,
                    tag_names=[TAG],
                    show_parties_and_counsel=True,
                    show_terminated_parties=True,
                    show_list_of_member_cases=True,
                ).set(queue=queue),
                add_or_update_recap_docket.s().set(queue=queue),
            ).apply_async()
def download_dockets(options):
    """Download dockets listed in the spreadsheet.

    :param options: argparse options providing the input file, queue name,
    queue length, offset, and limit.
    :return: None
    """
    # Fix: use a context manager instead of open()/f.close(). The original
    # only closed the file on the happy path, leaking the handle if any row
    # raised before the end of the loop.
    with open(options['input_file'], 'r') as f:
        dialect = csv.Sniffer().sniff(f.read(2048))
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect)
        q = options['queue']
        throttle = CeleryThrottle(queue_name=q,
                                  min_items=options['queue_length'])
        session = PacerSession(username=PACER_USERNAME,
                               password=PACER_PASSWORD)
        session.login()
        for i, row in enumerate(reader):
            if i < options['offset']:
                continue
            if i >= options['limit'] > 0:
                break
            throttle.maybe_wait()
            logger.info("Doing row %s: %s", i, row)
            # Tag each row individually so items can be traced back to it.
            row_tag = '%s-%s' % (PROJECT_TAG_NAME, row['id'])
            if not row['district_ct']:
                chain(
                    get_appellate_docket_by_docket_number.s(
                        docket_number=row['docket_no1'],
                        court_id=row['cl_court'],
                        cookies=session.cookies,
                        tag_names=[PROJECT_TAG_NAME, row_tag],
                        # Do not get the docket entries for now. We're only
                        # interested in the date terminated. If it's an open
                        # case, we'll handle that later.
                        **{
                            'show_docket_entries': False,
                            'show_orig_docket': False,
                            'show_prior_cases': False,
                            'show_associated_cases': False,
                            'show_panel_info': True,
                            'show_party_atty_info': True,
                            'show_caption': True,
                        }
                    ).set(queue=q),
                    add_or_update_recap_docket.s().set(queue=q),
                ).apply_async()
            else:
                chain(
                    get_pacer_case_id_and_title.s(
                        pass_through=None,
                        docket_number=row['docket_no1'],
                        court_id=row['cl_court'],
                        cookies=session.cookies,
                        case_name=row['name'],
                    ).set(queue=q),
                    do_case_query_by_pacer_case_id.s(
                        court_id=row['cl_court'],
                        cookies=session.cookies,
                        tag_names=[PROJECT_TAG_NAME, row_tag],
                    ).set(queue=q),
                    get_docket_by_pacer_case_id.s(
                        court_id=row['cl_court'],
                        cookies=session.cookies,
                        tag_names=[PROJECT_TAG_NAME, row_tag],
                        **{
                            # No docket entries
                            'doc_num_start': 10000,
                            'doc_num_end': 10000,
                            'show_parties_and_counsel': True,
                            'show_terminated_parties': True,
                            'show_list_of_member_cases': True,
                        }
                    ).set(queue=q),
                    add_or_update_recap_docket.s().set(queue=q),
                ).apply_async()