def update_docket_metadata(d: Docket, docket_data: Dict[str, Any]) -> Docket:
    """Update the Docket object with the data from Juriscraper.

    Works on either docket history report or docket report (appellate or
    district) results.
    """
    d = update_case_names(d, docket_data["case_name"])
    mark_ia_upload_needed(d, save_docket=False)
    d.docket_number = docket_data["docket_number"] or d.docket_number
    d.date_filed = docket_data.get("date_filed") or d.date_filed
    d.date_last_filing = (
        docket_data.get("date_last_filing") or d.date_last_filing
    )
    d.date_terminated = docket_data.get("date_terminated") or d.date_terminated
    d.cause = docket_data.get("cause") or d.cause
    d.nature_of_suit = docket_data.get("nature_of_suit") or d.nature_of_suit
    d.jury_demand = docket_data.get("jury_demand") or d.jury_demand
    d.jurisdiction_type = (
        docket_data.get("jurisdiction") or d.jurisdiction_type
    )
    d.mdl_status = docket_data.get("mdl_status") or d.mdl_status
    judges = get_candidate_judges(
        docket_data.get("assigned_to_str"),
        d.court_id,
        docket_data.get("date_filed"),
    )
    if judges is not None and len(judges) == 1:
        d.assigned_to = judges[0]
    d.assigned_to_str = docket_data.get("assigned_to_str") or ""
    judges = get_candidate_judges(
        docket_data.get("referred_to_str"),
        d.court_id,
        docket_data.get("date_filed"),
    )
    if judges is not None and len(judges) == 1:
        d.referred_to = judges[0]
    d.referred_to_str = docket_data.get("referred_to_str") or ""
    d.blocked, d.date_blocked = get_blocked_status(d)
    return d
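
# Illustrative call site for update_docket_metadata (a sketch; not part of the
# original module). It assumes ``docket_data`` is the parsed ``data`` payload
# of a Juriscraper docket report, which carries keys like "case_name" and
# "docket_number" read above, and that ``d`` is an existing Docket instance:
#
#     d = update_docket_metadata(d, report.data)
#     d.save()
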
def parse_harvard_opinions(reporter, volume, make_searchable):
    """Parse the downloaded CaseLaw Corpus from the Internet Archive and add
    the cases to our database.

    Optionally uses a reporter abbreviation to identify cases to download, as
    used by IA (ex. T.C. => tc).

    Optionally uses a volume integer.

    If neither is provided, the code will cycle through all downloaded files.

    :param reporter: Slugified reporter string (optional), ex. "tc" for T.C.
    :param volume: The volume (int) of the reporter (optional), ex. 10
    :param make_searchable: Boolean to indicate saving to Solr
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"])
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name, file_path):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML.
        # Flag them for a later crawl by using the placeholder '[[Image]]'.
        judge_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe the list of judges.
        judges = ", ".join(
            sorted(
                list(
                    set(
                        itertools.chain.from_iterable(judge_list + author_list)
                    )
                )
            )
        )
        judges = titlecase(judges)
        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        short_fields = ["attorneys", "disposition", "otherdate", "seealso"]
        long_fields = [
            "syllabus",
            "summary",
            "history",
            "headnotes",
            "correction",
        ]
        short_data = parse_extra_fields(soup, short_fields, False)
        long_data = parse_extra_fields(soup, long_fields, True)

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            try:
                with transaction.atomic():
                    docket.save()
            except OperationalError as e:
                if "exceeds maximum" in str(e):
                    docket.docket_number = (
                        "%s, See Corrections for full Docket Number"
                        % trunc(docket_string, length=5000, ellipsis="...")
                    )
                    docket.save()
                    long_data["correction"] = "%s <br> %s" % (
                        data["docket_number"],
                        long_data["correction"],
                    )

            # Handle partial dates by adding -01 to YYYY-MM dates.
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=short_data["attorneys"],
                disposition=short_data["disposition"],
                syllabus=long_data["syllabus"],
                summary=long_data["summary"],
                history=long_data["history"],
                other_dates=short_data["otherdate"],
                cross_reference=short_data["seealso"],
                headnotes=long_data["headnotes"],
                correction=long_data["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )
            cluster.save(index=False)

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.canonical_reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )

            new_op_pks = []
            for op in soup.find_all("opinion"):
                # This code cleans author tags for processing.
                # It is particularly useful for identifying Per Curiam.
                for elem in [op.find("author")]:
                    if elem is not None:
                        [x.extract() for x in elem.find_all("page-number")]

                auth = op.find("author")
                if auth is not None:
                    author_tag_str = titlecase(auth.text.strip(":"))
                    author_str = titlecase(
                        "".join(extract_judge_last_name(author_tag_str))
                    )
                else:
                    author_str = ""
                    author_tag_str = ""

                per_curiam = True if author_tag_str == "Per Curiam" else False
                # If Per Curiam is True, set the author string to Per Curiam.
                if per_curiam:
                    author_str = "Per Curiam"

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                op = Opinion(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    per_curiam=per_curiam,
                    extracted_by_ocr=True,
                )
                # Don't index now; do so later if desired.
                op.save(index=False)
                new_op_pks.append(op.pk)

        if make_searchable:
            add_items_to_solr.delay(new_op_pks, "search.Opinion")

        logger.info("Finished: %s", citation.base_citation())
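
# Illustrative invocation of parse_harvard_opinions (a sketch; not part of the
# original module). In practice this parser is driven by a management command
# that is not shown here; the reporter/volume values below simply mirror the
# docstring's T.C. example and are assumptions:
#
#     parse_harvard_opinions(reporter="tc", volume=10, make_searchable=True)
#
# Passing reporter=None and volume=None cycles through all downloaded files.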