def parse_items(self):
    """For every item in the directory, send it to Celery for processing."""
    docket_paths = get_docket_list()

    completed = 0
    for docket_path in docket_paths:
        if completed < self.options['start_item'] - 1:
            # Skip ahead if start_item is provided.
            completed += 1
            continue

        logger.info("%s: Parsing docket: %s" % (completed, docket_path))
        pacer_doc = PacerXMLParser(docket_path)

        # Skip any docket that is missing a required field. (A bare
        # `continue` inside the field loop would only skip the field, not
        # the docket.)
        required_fields = ['case_name', 'date_filed']
        missing_fields = [f for f in required_fields
                          if not getattr(pacer_doc, f)]
        if missing_fields:
            logger.error("Missing required field(s): %s" %
                         ", ".join(missing_fields))
            continue

        docket = lookup_and_save(pacer_doc, self.debug)
        if docket is not None:
            pacer_doc.make_documents(docket, self.debug)
            pacer_doc.make_parties(docket, self.debug)

        completed += 1

        max_items = self.options['max_items']
        # Chained comparison; see the sketch below.
        if completed >= max_items != -1:
            logger.info("\n\nCompleted %s items. Aborting early." % max_items)
            break

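# The guard `completed >= max_items != -1` above is a Python chained
# comparison, equivalent to `completed >= max_items and max_items != -1`,
# so passing -1 for max_items disables the early abort. A quick sketch of
# the semantics:
print(5 >= 5 != -1)   # True: the cap is hit at 5 items
print(5 >= -1 != -1)  # False: a cap of -1 means "no limit"
assert (5 >= 5 != -1) == (5 >= 5 and 5 != -1)
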
def down_for_only_me(session, url):
    """Check if a URL is down for just our server, or globally.

    :return: True if the URL is only down for us; False if it is either
    entirely up or entirely down.
    """
    try:
        check_and_log_url(session, url)
    except requests.RequestException:
        # Down from our server. Try from our proxy.
        try:
            proxy_response = check_if_global_outage(session, url)
        except requests.RequestException as e:
            logger.error("Problem hitting proxy: %s", e)
            raise

        j = proxy_response.json()
        if j["status_code"] is not None:
            # Down from our server, but up from our proxy. Yikes!
            return True
        else:
            # Down from our server, and down from our proxy. OK.
            return False

    # Up from our server. OK.
    return False

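# A minimal sketch exercising the interesting branch of down_for_only_me():
# our server fails but the proxy gets a response, so the function should
# return True. The module path "cl.lib.status_checks" is hypothetical;
# substitute the module that actually defines check_and_log_url and
# check_if_global_outage.
from unittest import mock

import requests

fake_proxy_resp = mock.Mock()
fake_proxy_resp.json.return_value = {"status_code": 200}  # Proxy reached it.

with mock.patch("cl.lib.status_checks.check_and_log_url",
                side_effect=requests.RequestException("timed out")), \
        mock.patch("cl.lib.status_checks.check_if_global_outage",
                   return_value=fake_proxy_resp):
    assert down_for_only_me(requests.Session(), "https://example.com") is True
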
def sample_dockets(self):
    """Iterate over `sample_size` items and extract the value at the XPath.

    If there are not `sample_size` recap dockets on disk, do the lesser of
    the two.
    """
    docket_paths = get_docket_list(self.options['path'])
    random.shuffle(docket_paths)

    completed = 0
    no_value = 0
    errors = 0
    c = Counter()
    for docket_path in docket_paths:
        with open(docket_path, 'r') as f:
            docket_xml_content = f.read()
        if not docket_xml_content:
            continue

        # Extract the XPath value.
        try:
            tree = etree.fromstring(docket_xml_content)
        except XMLSyntaxError:
            errors += 1
            continue
        try:
            values = tree.xpath(self.options['xpath'])
        except XPathEvalError:
            logger.error("Invalid XPath expression.")
            exit(1)

        if values:
            logger.info("%s: %s" % (completed, values))
            c.update([str(v) for v in values])
            completed += 1
        else:
            no_value += 1

        if completed == self.options['sample_size']:
            break

    with open('sample.pkl', 'wb') as f:
        pickle.dump(c, f)
    logger.info('\n%s items had no value. %s errors. Sample saved at '
                '"sample.pkl"' % (no_value, errors))

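# The sampled Counter can be reloaded for analysis once the command
# finishes. A minimal sketch of reading sample.pkl back:
import pickle

with open("sample.pkl", "rb") as f:
    sample_counter = pickle.load(f)
print(sample_counter.most_common(10))  # Ten most frequent XPath values.
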
def iterate_and_log_courts(courts):
    session = requests.Session()
    for court in courts:
        url = make_simple_url(court)
        logger.info("Checking url for %s: %s", court.pk, url)
        t1 = now()
        max_tries = 3
        try_number = 1
        while try_number <= max_tries:
            down_for_me = down_for_only_me(session, url)
            if not down_for_me:
                break
            try_number += 1
        else:
            # Tried `max_tries` times, and it was always down just for me.
            # Oof. Use % instead of logging params to bypass Sentry issue
            # grouping.
            logger.error(
                "After %s seconds and %s tries, failed to access %s's PACER "
                "website from our server, but got it via our proxy each time."
                % ((now() - t1).seconds, max_tries, court.pk))

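# The while/else above leans on a Python subtlety: the else block runs only
# when the loop condition becomes false, never when the loop exits via
# break. A toy demonstration of the pattern:
def try_up_to(max_tries, succeeds_on):
    try_number = 1
    while try_number <= max_tries:
        if try_number == succeeds_on:
            print("succeeded on try", try_number)
            break
        try_number += 1
    else:
        print("all", max_tries, "tries failed")

try_up_to(3, succeeds_on=2)   # succeeded on try 2
try_up_to(3, succeeds_on=99)  # all 3 tries failed
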
def add_directory(options):
    """Import JSON files from a directory provided at the command line.

    Use `glob.glob` to identify JSON files to import.

    :return: None
    """
    dir_glob = options["directory_glob"]
    skip_until = options["skip_until"]
    if dir_glob is None:
        print("--directory-glob is a required parameter when the "
              "'add-directory' action is selected.")
    else:
        fps = sorted(glob(dir_glob))
        if skip_until:
            # Remove items from the list until the skip_until value is hit.
            try:
                skip_index = fps.index(skip_until)
                fps = fps[skip_index:]
            except ValueError:
                logger.error(
                    "Unable to find '%s' in directory_glob: '%s'. "
                    "The first few items of the glob look like: \n "
                    "%s",
                    skip_until,
                    dir_glob,
                    "\n ".join(fps[0:3]),
                )
                raise

        q = options["queue"]
        throttle = CeleryThrottle(queue_name=q)
        for fp in fps:
            throttle.maybe_wait()
            logger.info("Adding LASC JSON file at: %s", fp)
            tasks.add_case_from_filepath.apply_async(
                kwargs={"filepath": fp}, queue=q)

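# The skip_until handling above is plain list slicing from the first match.
# A toy demonstration with fabricated file names:
fps = ["a.json", "b.json", "c.json", "d.json"]
skip_until = "c.json"
fps = fps[fps.index(skip_until):]
print(fps)  # ['c.json', 'd.json'] -- processing resumes at the match
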
def get_and_save_free_document_reports(options):
    """Query the Free Doc Reports on PACER and get a list of all the free
    documents. Do not download those items, as that step is done later.
    """
    # Kill any *old* logs that report they're in progress. (They've failed.)
    twelve_hrs_ago = now() - timedelta(hours=12)
    PACERFreeDocumentLog.objects.filter(
        date_started__lt=twelve_hrs_ago,
        status=PACERFreeDocumentLog.SCRAPE_IN_PROGRESS,
    ).update(status=PACERFreeDocumentLog.SCRAPE_FAILED)

    cl_court_ids = Court.objects.filter(
        jurisdiction__in=[Court.FEDERAL_DISTRICT, Court.FEDERAL_BANKRUPTCY],
        in_use=True,
        end_date=None,
    ).exclude(pk__in=[
        'casb', 'ganb', 'gub', 'innb', 'mieb', 'miwb', 'nmib', 'nvb',
        'ohsb', 'prb', 'tnwb', 'vib',
    ]).values_list('pk', flat=True)

    pacer_court_ids = {
        map_cl_to_pacer_id(v): {'until': now(), 'count': 1, 'result': None}
        for v in cl_court_ids
    }
    pacer_session = PacerSession(username=PACER_USERNAME,
                                 password=PACER_PASSWORD)
    pacer_session.login()

    # Iterate over every court, X days at a time. As courts are completed,
    # remove them from the dict of courts to process until none are left.
    tomorrow = now() + timedelta(days=1)
    while len(pacer_court_ids) > 0:
        court_ids_copy = pacer_court_ids.copy()  # Copy so we can pop keys.
        for pacer_court_id, delay in court_ids_copy.items():
            if now() < delay['until']:
                # Do other courts until the delay is up. Do not print/log
                # anything since at the end there will only be one court
                # left.
                continue

            next_start_date, next_end_date = get_next_date_range(
                pacer_court_id)
            if delay['result'] is not None:
                if delay['result'].ready():
                    result = delay['result'].get()
                    if result == PACERFreeDocumentLog.SCRAPE_SUCCESSFUL:
                        if next_start_date >= tomorrow.date():
                            logger.info("Finished '%s'. Marking it "
                                        "complete." % pacer_court_id)
                            pacer_court_ids.pop(pacer_court_id, None)
                            continue
                    elif result == PACERFreeDocumentLog.SCRAPE_FAILED:
                        logger.error("Encountered critical error on %s "
                                     "(network error?). Marking as failed "
                                     "and pressing on." % pacer_court_id)
                        pacer_court_ids.pop(pacer_court_id, None)
                        continue
                else:
                    next_delay = min(delay['count'] * 5, 30)  # Backoff w/cap
                    logger.info(
                        "Court %s still in progress. Delaying at least "
                        "%ss." % (pacer_court_id, next_delay))
                    pacer_court_ids[pacer_court_id]['until'] = \
                        now() + timedelta(seconds=next_delay)
                    pacer_court_ids[pacer_court_id]['count'] += 1
                    continue

            mark_court_in_progress(pacer_court_id, next_end_date)
            pacer_court_ids[pacer_court_id]['count'] = 1  # Reset
            delay['result'] = chain(
                get_and_save_free_document_report.si(pacer_court_id,
                                                     next_start_date,
                                                     next_end_date,
                                                     pacer_session),
                mark_court_done_on_date.s(pacer_court_id, next_end_date),
            ).apply_async()

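# The retry delay above grows linearly with a 30-second cap. A quick look
# at the schedule `min(count * 5, 30)` produces as the count climbs:
print([min(count * 5, 30) for count in range(1, 9)])
# -> [5, 10, 15, 20, 25, 30, 30, 30]
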
def parse_harvard_opinions(reporter, volume, make_searchable):
    """Parse the downloaded CaseLaw Corpus from the Internet Archive and add
    it to our database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA. (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    :param volume: The volume (int) of the reporters (optional) (ex 10)
    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :param make_searchable: Boolean to indicate saving to solr
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"])
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name, file_path):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML.
        # Flag them for a later crawl by using the placeholder '[[Image]]'.
        judge_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe the list of judges.
        judges = ", ".join(
            sorted(
                set(itertools.chain.from_iterable(judge_list + author_list))
            )
        )
        judges = titlecase(judges)

        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        short_fields = ["attorneys", "disposition", "otherdate", "seealso"]
        long_fields = [
            "syllabus",
            "summary",
            "history",
            "headnotes",
            "correction",
        ]
        short_data = parse_extra_fields(soup, short_fields, False)
        long_data = parse_extra_fields(soup, long_fields, True)

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            try:
                with transaction.atomic():
                    docket.save()
            except OperationalError as e:
                if "exceeds maximum" in str(e):
                    docket.docket_number = (
                        "%s, See Corrections for full Docket Number"
                        % trunc(docket_string, length=5000, ellipsis="...")
                    )
                    docket.save()
                    long_data["correction"] = "%s <br> %s" % (
                        data["docket_number"],
                        long_data["correction"],
                    )

            # Handle partial dates by adding -01 to YYYY-MM dates.
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=short_data["attorneys"],
                disposition=short_data["disposition"],
                syllabus=long_data["syllabus"],
                summary=long_data["summary"],
                history=long_data["history"],
                other_dates=short_data["otherdate"],
                cross_reference=short_data["seealso"],
                headnotes=long_data["headnotes"],
                correction=long_data["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )
            cluster.save(index=False)

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.canonical_reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )

            new_op_pks = []
            for op in soup.find_all("opinion"):
                # Strip page-number tags out of the author tag so its text
                # is clean. This is particularly useful for identifying
                # Per Curiam.
                author_elem = op.find("author")
                if author_elem is not None:
                    for page_number in author_elem.find_all("page-number"):
                        page_number.extract()

                auth = op.find("author")
                if auth is not None:
                    author_tag_str = titlecase(auth.text.strip(":"))
                    author_str = titlecase(
                        "".join(extract_judge_last_name(author_tag_str))
                    )
                else:
                    author_str = ""
                    author_tag_str = ""

                per_curiam = author_tag_str == "Per Curiam"
                # If Per Curiam is True, set the author string to Per Curiam.
                if per_curiam:
                    author_str = "Per Curiam"

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s",
                            citation.base_citation())
                opinion = Opinion(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    per_curiam=per_curiam,
                    extracted_by_ocr=True,
                )
                # Don't index now; do so later if desired.
                opinion.save(index=False)
                new_op_pks.append(opinion.pk)

        if make_searchable:
            add_items_to_solr.delay(new_op_pks, "search.Opinion")

        logger.info("Finished: %s", citation.base_citation())

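# validate_dt() is defined elsewhere; as an illustration only, here is a
# plausible sketch of the partial-date handling the comment above describes
# (an assumption, not the project's actual implementation): pad YYYY and
# YYYY-MM strings out to a full date and flag the result as approximate.
from datetime import datetime


def validate_dt_sketch(date_str):
    """Return (date, is_approximate) for a YYYY[-MM[-DD]] string."""
    for fmt, approximate in (("%Y-%m-%d", False), ("%Y-%m", True),
                             ("%Y", True)):
        try:
            return datetime.strptime(date_str, fmt).date(), approximate
        except ValueError:
            continue
    raise ValueError("Unparseable decision_date: %s" % date_str)


print(validate_dt_sketch("1982-07"))     # (datetime.date(1982, 7, 1), True)
print(validate_dt_sketch("1982-07-14"))  # (datetime.date(1982, 7, 14), False)
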
def get_and_save_free_document_reports(options):
    """Query the Free Doc Reports on PACER and get a list of all the free
    documents. Do not download those items, as that step is done later. For
    now, just get the list.

    Note that this uses synchronous celery chains. A previous version was
    more complex and did not use synchronous chains. Unfortunately in Celery
    4.2.0, or more accurately in redis-py 3.x.x, doing it that way failed
    nearly every time.

    This is a simpler version, though a slower one, but it should get the
    job done.
    """
    # Kill any *old* logs that report they're in progress. (They've failed.)
    three_hrs_ago = now() - timedelta(hours=3)
    PACERFreeDocumentLog.objects.filter(
        date_started__lt=three_hrs_ago,
        status=PACERFreeDocumentLog.SCRAPE_IN_PROGRESS,
    ).update(status=PACERFreeDocumentLog.SCRAPE_FAILED)

    cl_court_ids = (
        Court.federal_courts.district_pacer_courts()
        .filter(in_use=True, end_date=None)
        .exclude(pk__in=["casb", "gub", "innb", "miwb", "ohsb", "prb"])
        .values_list("pk", flat=True)
    )
    pacer_court_ids = [map_cl_to_pacer_id(v) for v in cl_court_ids]
    today = now()
    for pacer_court_id in pacer_court_ids:
        while True:
            next_start_d, next_end_d = get_next_date_range(pacer_court_id)
            logger.info(
                "Attempting to get latest document references for "
                "%s between %s and %s",
                pacer_court_id,
                next_start_d,
                next_end_d,
            )
            mark_court_in_progress(pacer_court_id, next_end_d)
            try:
                status = get_and_save_free_document_report(
                    pacer_court_id,
                    next_start_d,
                    next_end_d,
                )
            except RequestException:
                logger.error(
                    "Failed to get document references for %s "
                    "between %s and %s due to network error.",
                    pacer_court_id,
                    next_start_d,
                    next_end_d,
                )
                mark_court_done_on_date(
                    PACERFreeDocumentLog.SCRAPE_FAILED,
                    pacer_court_id,
                    next_end_d,
                )
                break
            except IndexError:
                logger.error(
                    "Failed to get document references for %s "
                    "between %s and %s due to PACER 6.3 bug.",
                    pacer_court_id,
                    next_start_d,
                    next_end_d,
                )
                mark_court_done_on_date(
                    PACERFreeDocumentLog.SCRAPE_FAILED,
                    pacer_court_id,
                    next_end_d,
                )
                break
            else:
                result = mark_court_done_on_date(status, pacer_court_id,
                                                 next_end_d)
                if result == PACERFreeDocumentLog.SCRAPE_SUCCESSFUL:
                    if next_end_d >= today.date():
                        logger.info("Got all document references for '%s'.",
                                    pacer_court_id)
                        # Break from the while loop; on to the next court.
                        break
                    else:
                        # More dates to do; let the while loop continue.
                        continue
                elif result == PACERFreeDocumentLog.SCRAPE_FAILED:
                    logger.error("Encountered critical error on %s "
                                 "(network error?). Marking as failed and "
                                 "pressing on." % pacer_court_id)
                    # Break from the while loop; on to the next court.
                    break

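# A stripped-down sketch of the control flow above, with all PACER calls
# stubbed out: for each court, march through date ranges until "today" is
# reached or a failure aborts that court. The helper fake_scrape and the
# backlog/chunk sizes are hypothetical stand-ins, not the real task.
from datetime import date, timedelta


def fake_scrape(court_id, start, end):
    return "successful"  # Pretend every range scrapes cleanly.


def catch_up(court_ids, today, backlog_days=7, chunk_days=3):
    for court_id in court_ids:
        start = today - timedelta(days=backlog_days)
        while True:
            end = min(start + timedelta(days=chunk_days), today)
            if fake_scrape(court_id, start, end) != "successful":
                break  # Failure: give up on this court, on to the next.
            if end >= today:
                break  # Caught up: on to the next court.
            start = end


catch_up(["cand", "nysd"], date.today())
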
def parse_harvard_opinions(reporter, volume):
    """Parse the downloaded CaseLaw Corpus from the Internet Archive and add
    it to our database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA. (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    :param volume: The volume (int) of the reporters (optional) (ex 10)
    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"], html=False)
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML.
        # Flag them for a later crawl by using the placeholder '[[Image]]'.
        judge_list = [
            find_judge_names(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            find_judge_names(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe the list of judges.
        judges = ", ".join(
            list(set(itertools.chain.from_iterable(judge_list + author_list)))
        )
        judges = titlecase(judges)

        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )

            # Iterate over the other XML fields in the Harvard data set and
            # save them as string lists for further processing at a later
            # date.
            json_fields = [
                "attorneys",
                "disposition",
                "syllabus",
                "summary",
                "history",
                "otherdate",
                "seealso",
                "headnotes",
                "correction",
            ]
            data_set = {}
            while json_fields:
                key = json_fields.pop(0)
                data_set[key] = "|".join([x.text for x in soup.find_all(key)])

            # Handle partial dates by adding -01 to YYYY-MM dates.
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=data_set["attorneys"],
                disposition=data_set["disposition"],
                syllabus=data_set["syllabus"],
                summary=data_set["summary"],
                history=data_set["history"],
                other_dates=data_set["otherdate"],
                cross_reference=data_set["seealso"],
                headnotes=data_set["headnotes"],
                correction=data_set["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )

            for op in soup.find_all("opinion"):
                joined_by_str = titlecase(
                    " ".join(
                        list(set(itertools.chain.from_iterable(judge_list)))
                    )
                )
                author_str = titlecase(
                    " ".join(
                        list(set(itertools.chain.from_iterable(author_list)))
                    )
                )

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s",
                            citation.base_citation())
                Opinion.objects.create(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    joined_by_str=joined_by_str,
                    extracted_by_ocr=True,
                )

        logger.info("Finished: %s", citation.base_citation())

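# The while/pop loop that builds data_set above is equivalent to a dict
# comprehension over the field names. A runnable demonstration with a tiny
# fabricated casebody:
from bs4 import BeautifulSoup

soup = BeautifulSoup(
    "<case><syllabus>First.</syllabus><syllabus>Second.</syllabus></case>",
    "lxml",
)
fields = ["syllabus", "headnotes"]
data_set = {key: "|".join(x.text for x in soup.find_all(key))
            for key in fields}
print(data_set)  # {'syllabus': 'First.|Second.', 'headnotes': ''}
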
def get_and_save_free_document_reports(options):
    """Query the Free Doc Reports on PACER and get a list of all the free
    documents. Do not download those items, as that step is done later.
    """
    # Kill any *old* logs that report they're in progress. (They've failed.)
    three_hrs_ago = now() - timedelta(hours=3)
    PACERFreeDocumentLog.objects.filter(
        date_started__lt=three_hrs_ago,
        status=PACERFreeDocumentLog.SCRAPE_IN_PROGRESS,
    ).update(status=PACERFreeDocumentLog.SCRAPE_FAILED)

    cl_court_ids = Court.objects.filter(
        jurisdiction__in=[Court.FEDERAL_DISTRICT, Court.FEDERAL_BANKRUPTCY],
        in_use=True,
        end_date=None,
    ).exclude(
        pk__in=['casb', 'gub', 'innb', 'miwb', 'ohsb', 'prb'],
    ).values_list(
        'pk', flat=True,
    )
    pacer_court_ids = {
        map_cl_to_pacer_id(v): {'until': now(), 'count': 1, 'result': None}
        for v in cl_court_ids
    }
    pacer_session = PacerSession(username=PACER_USERNAME,
                                 password=PACER_PASSWORD)
    pacer_session.login()

    # Iterate over every court, X days at a time. As courts are completed,
    # remove them from the dict of courts to process until none are left.
    today = now()
    max_delay_count = 20
    while len(pacer_court_ids) > 0:
        court_ids_copy = pacer_court_ids.copy()  # Copy so we can pop keys.
        for pacer_court_id, delay in court_ids_copy.items():
            if now() < delay['until']:
                # Do other courts until the delay is up. Do not print/log
                # anything since at the end there will only be one court
                # left.
                continue

            next_start_d, next_end_d = get_next_date_range(pacer_court_id)
            if delay['result'] is not None:
                if delay['result'].ready():
                    result = delay['result'].get()
                    if result == PACERFreeDocumentLog.SCRAPE_SUCCESSFUL:
                        if next_end_d >= today.date():
                            logger.info("Finished '%s'. Marking it "
                                        "complete." % pacer_court_id)
                            pacer_court_ids.pop(pacer_court_id, None)
                            continue
                    elif result == PACERFreeDocumentLog.SCRAPE_FAILED:
                        logger.error("Encountered critical error on %s "
                                     "(network error?). Marking as failed "
                                     "and pressing on." % pacer_court_id)
                        pacer_court_ids.pop(pacer_court_id, None)
                        continue
                else:
                    if delay['count'] > max_delay_count:
                        logger.error("Something went wrong and we weren't "
                                     "able to finish %s. We ran out of "
                                     "time." % pacer_court_id)
                        pacer_court_ids.pop(pacer_court_id, None)
                        continue
                    next_delay = min(delay['count'] * 5, 30)  # Backoff w/cap
                    logger.info("Court %s still in progress. Delaying at "
                                "least %ss." % (pacer_court_id, next_delay))
                    delay_until = now() + timedelta(seconds=next_delay)
                    pacer_court_ids[pacer_court_id]['until'] = delay_until
                    pacer_court_ids[pacer_court_id]['count'] += 1
                    continue

            mark_court_in_progress(pacer_court_id, next_end_d)
            pacer_court_ids[pacer_court_id]['count'] = 1  # Reset
            delay['result'] = chain(
                get_and_save_free_document_report.si(
                    pacer_court_id,
                    next_start_d,
                    next_end_d,
                    pacer_session.cookies,
                ),
                mark_court_done_on_date.s(pacer_court_id, next_end_d),
            ).apply_async()

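# The chain above pairs an immutable signature (.si) with a partial one
# (.s): .si ignores any upstream result, while .s receives the previous
# task's return value as its first positional argument. So the scrape
# status returned by get_and_save_free_document_report is prepended to
# mark_court_done_on_date's arguments. A minimal sketch with hypothetical
# task names (constructing the chain needs no broker; running it does):
from celery import Celery, chain

app = Celery("sketch")


@app.task
def scrape(court_id, start, end):
    return "successful"  # This return value flows into the next task.


@app.task
def mark_done(status, court_id, end):
    return status, court_id, end


# When executed by a worker, mark_done is called as
# mark_done("successful", "cand", "2020-01-04").
sig = chain(scrape.si("cand", "2020-01-01", "2020-01-04"),
            mark_done.s("cand", "2020-01-04"))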