def test_get_appellate_court_object_from_string(self) -> None:
    """Check that appellate court names resolve to the expected court ID.

    Every case pairs a court-name string ("q") with the ID ("a") that
    ``match_court_string`` must return when called with
    ``federal_appeals=True``.
    """
    ninth_circuit_names = (
        "U. S. Court of Appeals for the Ninth Circuit",
        # FJC data does not appear to have a space between U. and S.
        "U.S. Court of Appeals for the Ninth Circuit",
        "U. S. Circuit Court for the Ninth Circuit",
        "U.S. Circuit Court for the Ninth Circuit",
    )
    cases = tuple({"q": name, "a": "ca9"} for name in ninth_circuit_names)

    for case in cases:
        query, expected = case["q"], case["a"]
        print(f"Testing: {query}, expecting: {expected}")
        actual = match_court_string(query, federal_appeals=True)
        self.assertEqual(expected, actual)
def test_get_appellate_court_object_from_string(self):
    """Check that appellate court names resolve to the expected court ID.

    Every case pairs a court-name string ('q') with the ID ('a') that
    ``match_court_string`` must return when called with
    ``federal_appeals=True``.
    """
    ninth_circuit_names = [
        'U. S. Court of Appeals for the Ninth Circuit',
        # FJC data does not appear to have a space between U. and S.
        'U.S. Court of Appeals for the Ninth Circuit',
        'U. S. Circuit Court for the Ninth Circuit',
        'U.S. Circuit Court for the Ninth Circuit',
    ]
    cases = tuple({'q': name, 'a': 'ca9'} for name in ninth_circuit_names)

    for case in cases:
        query = case['q']
        expected = case['a']
        print("Testing: %s, expecting: %s" % (query, expected))
        actual = match_court_string(query, federal_appeals=True)
        self.assertEqual(expected, actual)
def update_bankruptcy_and_magistrate(testing=False):
    """Set the court and position type on bankruptcy and magistrate positions.

    Positions whose job title contains "Bankruptcy" are matched against
    bankruptcy courts and typed as 'jud'. Positions whose job title contains
    "Magistrate" are matched against federal district courts and typed as
    'm-jud'. The court is looked up from each position's ``location`` via
    ``match_court_string``.

    :param testing: If true, the modified positions are not saved.
    :return: None
    """
    # Update bankruptcy positions. The queryset must come from the
    # ``objects`` manager (``Position.object`` is not a manager).
    bankruptcy_positions = Position.objects.filter(
        job_title__icontains='Bankruptcy'
    )
    for position in bankruptcy_positions:
        bcourt = match_court_string(position.location, bankruptcy=True)
        if bcourt is None:
            # No court could be matched; leave this position untouched.
            continue
        position.court_id = bcourt
        position.position_type = 'jud'
        if not testing:
            position.save()

    # Update magistrate positions. Unlike the bankruptcy loop, there is no
    # None check here: an unmatched location results in court_id = None.
    magistrate_positions = Position.objects.filter(
        job_title__icontains='Magistrate'
    )
    for position in magistrate_positions:
        mcourt = match_court_string(position.location, federal_district=True)
        position.court_id = mcourt
        position.position_type = 'm-jud'
        if not testing:
            position.save()
def update_bankruptcy_and_magistrate(testing=False):
    """Assign courts and position types to bankruptcy/magistrate positions.

    Two passes are made over ``Position`` rows, selected by job title:

    * "Bankruptcy" titles: the position's ``location`` is matched with
      ``match_court_string(..., bankruptcy=True)``; matched positions get
      that court ID and the position type 'jud'. Unmatched ones are skipped.
    * "Magistrate" titles: the ``location`` is matched with
      ``match_court_string(..., federal_district=True)``; every position
      gets the result as its court ID and the position type 'm-jud'.

    :param testing: If true, nothing is saved.
    :return: None
    """
    # Bankruptcy pass. Use the ``objects`` manager; ``Position.object`` is
    # not a manager and fails with AttributeError on a standard model.
    for position in Position.objects.filter(
        job_title__icontains='Bankruptcy'
    ):
        location = position.location
        bcourt = match_court_string(location, bankruptcy=True)
        if bcourt is None:
            continue
        position.court_id = bcourt
        position.position_type = 'jud'
        if not testing:
            position.save()

    # Magistrate pass. The match result is stored even when it is None.
    for position in Position.objects.filter(
        job_title__icontains='Magistrate'
    ):
        location = position.location
        mcourt = match_court_string(location, federal_district=True)
        position.court_id = mcourt
        position.position_type = 'm-jud'
        if not testing:
            position.save()
def test_get_fed_court_object_from_string(self):
    """Check that district court names resolve to the expected court ID.

    Every case pairs a court-name string ('q') with the ID ('a') that
    ``match_court_string`` must return when called with
    ``federal_district=True``.
    """
    name_to_id = (
        ('Eastern District of New York', 'nyed'),
        ('Northern District of New York', 'nynd'),
        ('Southern District of New York', 'nysd'),
        # When we have unknown first word, we assume it's errant.
        ('Nathan District of New York', 'nyd'),
        ("Nate District of New York", 'nyd'),
        ("Middle District of Pennsylvania", 'pamd'),
        ("Middle Dist. of Pennsylvania", 'pamd'),
        ("M.D. of Pennsylvania", 'pamd'),
    )
    cases = tuple({'q': name, 'a': court} for name, court in name_to_id)

    for case in cases:
        query, expected = case['q'], case['a']
        print("Testing: %s, expecting: %s" % (query, expected))
        actual = match_court_string(query, federal_district=True)
        self.assertEqual(expected, actual)
def test_get_fed_court_object_from_string(self):
    """Check that district court names resolve to the expected court ID.

    Every case pairs a court-name string ("q") with the ID ("a") that
    ``match_court_string`` must return when called with
    ``federal_district=True``.
    """
    name_to_id = [
        ("Eastern District of New York", "nyed"),
        ("Northern District of New York", "nynd"),
        ("Southern District of New York", "nysd"),
        # When we have unknown first word, we assume it's errant.
        ("Nathan District of New York", "nyd"),
        ("Nate District of New York", "nyd"),
        ("Middle District of Pennsylvania", "pamd"),
        ("Middle Dist. of Pennsylvania", "pamd"),
        ("M.D. of Pennsylvania", "pamd"),
    ]
    cases = tuple({"q": name, "a": court} for name, court in name_to_id)

    for case in cases:
        query = case["q"]
        expected = case["a"]
        print("Testing: %s, expecting: %s" % (query, expected))
        actual = match_court_string(query, federal_district=True)
        self.assertEqual(expected, actual)
def test_get_fed_court_object_from_string(self):
    """Check that district court names resolve to the expected court ID.

    The expectations are kept as 'q' (query string) / 'a' (expected court
    ID) records and each one is run through ``match_court_string`` with
    ``federal_district=True``.
    """
    new_york = [
        {'q': 'Eastern District of New York', 'a': 'nyed'},
        {'q': 'Northern District of New York', 'a': 'nynd'},
        {'q': 'Southern District of New York', 'a': 'nysd'},
        # When we have unknown first word, we assume it's errant.
        {'q': 'Nathan District of New York', 'a': 'nyd'},
        {'q': "Nate District of New York", 'a': 'nyd'},
    ]
    pennsylvania = [
        {'q': "Middle District of Pennsylvania", 'a': 'pamd'},
        {'q': "Middle Dist. of Pennsylvania", 'a': 'pamd'},
        {'q': "M.D. of Pennsylvania", 'a': 'pamd'},
    ]

    for record in tuple(new_york + pennsylvania):
        question = record['q']
        answer = record['a']
        print("Testing: %s, expecting: %s" % (question, answer))
        matched = match_court_string(question, federal_district=True)
        self.assertEqual(answer, matched)
def add_positions_from_row(item, person, testing, fix_nums=None):
    """Create positions and related objects for one row of FJC judge data.

    Up to six positions are read from the row. Columns for position N carry
    the suffix " (N)" (e.g. "Court Name (1)"). For each position with a court
    name, a ``Position`` is built and, depending on the row's contents, a
    ``PoliticalAffiliation`` and an ``ABARating``. Finally a ``Source`` record
    pointing at the FJC CSV is created for the person.

    :param item: The row; indexed by column name (e.g. a pandas Series).
    :param person: The person the new objects are attached to.
    :param testing: If true, nothing is saved.
    :param fix_nums: Optional collection of position numbers. When given,
    only objects belonging to those positions are saved.
    :return: None
    :raises Exception: If the court name of a position cannot be resolved to
    a court ID.
    """
    # Maps the FJC termination text to the codes stored on Position.
    termdict = {
        "Abolition of Court": "abolished",
        "Death": "ded",
        "Reassignment": "other_pos",
        "Appointment to Another Judicial Position": "other_pos",
        "Impeachment & Conviction": "bad_judge",
        "Recess Appointment-Not Confirmed": "recess_not_confirmed",
        "Resignation": "resign",
        "Retirement": "retire_vol",
    }

    # add position items (up to 6 of them)
    prev_politics = None
    for posnum in range(1, 7):
        # Save the position if we're running all positions or specifically
        # fixing this one.
        save_this_position = fix_nums is None or posnum in fix_nums
        pos_str = " (%s)" % posnum

        court_name = item["Court Name" + pos_str]
        if pd.isnull(court_name):
            continue

        # Reset on every iteration: without this, a court name matching
        # neither pattern would either hit an unbound local (first position)
        # or silently reuse the court of the previous position.
        courtid = None
        if re.search("appeal", court_name, re.I):
            courtid = match_court_string(court_name, federal_appeals=True)
        elif re.search("district|trade", court_name, re.I):
            courtid = match_court_string(court_name, federal_district=True)

        if courtid is None:
            raise Exception(
                "Unable to match court name %r (position %s)"
                % (court_name, posnum)
            )

        date_nominated = process_date_string(
            item["Nomination Date" + pos_str]
        )
        date_recess_appointment = process_date_string(
            item["Recess Appointment Date" + pos_str]
        )
        date_referred_to_judicial_committee = process_date_string(
            item["Committee Referral Date" + pos_str]
        )
        date_judicial_committee_action = process_date_string(
            item["Committee Action Date" + pos_str]
        )
        date_hearing = process_date_string(item["Hearing Date" + pos_str])
        date_confirmation = process_date_string(
            item["Confirmation Date" + pos_str]
        )

        # assign start date; fall back to the recess appointment date
        date_start = process_date_string(item["Commission Date" + pos_str])
        if pd.isnull(date_start) and not pd.isnull(date_recess_appointment):
            date_start = date_recess_appointment
        if pd.isnull(date_start):
            # Still no start date: normalize to None (the position is kept).
            date_start = None

        date_termination = process_date_string(
            item["Termination Date" + pos_str]
        )
        termination = item["Termination" + pos_str]

        if date_termination is None:
            date_granularity_termination = ""
        else:
            date_granularity_termination = GRANULARITY_DAY

        # check duplicate position
        # NOTE(review): this filters on the raw "Termination" text, whereas
        # the Position below stores the mapped code -- confirm intent.
        dupe_search = Position.objects.filter(
            person=person,
            position_type="jud",
            date_start=date_start,
            date_termination=date_termination,
            termination_reason=termination,
            court_id=courtid,
        )
        if len(dupe_search) > 0:
            print("Duplicate position:", dupe_search)
            continue

        # assign appointing president
        if not pd.isnull(item["Reappointing President" + pos_str]):
            appointstr = item["Reappointing President" + pos_str]
        else:
            appointstr = item["Appointing President" + pos_str]
        appointer = None
        if appointstr not in ["Assignment", "Reassignment"]:
            names = appointstr.split()
            if len(names) == 3:
                first, mid, last = names
            else:
                first, last = names[0], names[-1]
                mid = ""
            # Narrow the search step by step until a single match is left.
            appoint_search = Position.objects.filter(
                person__name_first__iexact=first,
                person__name_last__iexact=last,
            )
            if len(appoint_search) > 1:
                appoint_search = Position.objects.filter(
                    person__name_first__iexact=first,
                    person__name_last__iexact=last,
                    person__name_middle__iexact=mid,
                    position_type="pres",
                )
            if len(appoint_search) > 1:
                appoint_search = Position.objects.filter(
                    person__name_first__iexact=first,
                    person__name_last__iexact=last,
                    person__name_middle__iexact=mid,
                    position_type="pres",
                    date_start__lte=date_nominated,
                    date_termination__gte=date_nominated,
                )
            if len(appoint_search) == 0:
                print(names, appoint_search)
            if len(appoint_search) > 1:
                print(names, appoint_search)
            if len(appoint_search) == 1:
                appointer = appoint_search[0]

        # Senate votes data.
        votes = item["Ayes/Nays" + pos_str]
        if not pd.isnull(votes):
            votes_yes, votes_no = votes.split("/")
        else:
            votes_yes = None
            votes_no = None
        voice_vote = bool(item["Senate Vote Type" + pos_str] == "Yes")

        if pd.isnull(termination):
            term_reason = ""
        else:
            term_reason = termdict[termination]

        position = Position(
            person=person,
            court_id=courtid,
            position_type="jud",
            date_nominated=date_nominated,
            date_recess_appointment=date_recess_appointment,
            date_referred_to_judicial_committee=(
                date_referred_to_judicial_committee
            ),
            date_judicial_committee_action=date_judicial_committee_action,
            date_hearing=date_hearing,
            date_confirmation=date_confirmation,
            date_start=date_start,
            date_granularity_start=GRANULARITY_DAY,
            date_termination=date_termination,
            date_granularity_termination=date_granularity_termination,
            appointer=appointer,
            voice_vote=voice_vote,
            votes_yes=votes_yes,
            votes_no=votes_no,
            vote_type="s",
            how_selected="a_pres",
            termination_reason=term_reason,
        )
        if not testing and save_this_position:
            position.save()

        # set party
        p = item["Party of Appointing President" + pos_str]
        if not pd.isnull(p) and p not in ["Assignment", "Reassignment"]:
            party = get_party(p)
            if prev_politics is None:
                if pd.isnull(date_nominated):
                    politicsgran = ""
                else:
                    politicsgran = GRANULARITY_DAY
                politics = PoliticalAffiliation(
                    person=person,
                    political_party=party,
                    date_start=date_nominated,
                    date_granularity_start=politicsgran,
                    source="a",
                )
                if not testing and save_this_position:
                    politics.save()
                prev_politics = party
            elif party != prev_politics:
                # account for changing political affiliation: close the
                # previous affiliation and open a new one.
                politics.date_end = date_nominated
                politics.date_granularity_end = GRANULARITY_DAY
                if not testing and save_this_position:
                    politics.save()
                politics = PoliticalAffiliation(
                    person=person,
                    political_party=party,
                    date_start=date_nominated,
                    date_granularity_start=GRANULARITY_DAY,
                    source="a",
                )
                if not testing and save_this_position:
                    politics.save()

        rating = get_aba(item["ABA Rating" + pos_str])
        if rating is not None:
            # NOTE(review): assumes a nomination date exists whenever an ABA
            # rating does -- confirm.
            nom_year = date_nominated.year
            aba = ABARating(person=person, rating=rating, year_rated=nom_year)
            if not testing and save_this_position:
                aba.save()

    # Add URL and date accessed.
    # NOTE(review): created once per call, regardless of fix_nums -- confirm.
    sources = Source(
        person=person,
        url="https://www.fjc.gov/sites/default/files/history/judges.csv",
        date_accessed=str(date.today()),
    )
    if not testing:
        sources.save()
def parse_harvard_opinions(reporter, volume, make_searchable):
    """
    Parse downloaded CaseLaw Corpus from internet archive and add them to our
    database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA.  (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    For every file that is not already in the system, a Docket, an
    OpinionCluster, a Citation and one Opinion per <opinion> tag are saved.

    :param volume: The volume (int) of the reporters (optional) (ex 10)
    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :param make_searchable: Boolean to indicate saving to solr
    :return: None
    """
    # A volume only makes sense together with a reporter.
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        # Used in log messages only; built from the tail of the local path.
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        # Skip files that were already imported.
        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            # json.load raises ValueError when the content is not valid JSON.
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        # Only the first citation in the file is parsed.
        cites = get_citations(data["citations"][0]["cite"])
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name, file_path):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            sorted(
                list(
                    set(
                        itertools.chain.from_iterable(judge_list + author_list)
                    )
                )
            )
        )
        judges = titlecase(judges)
        # Strip the "Docket No(s)." label from the docket number.
        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        # Extra tags pulled out of the case body; the third argument of
        # parse_extra_fields differs between the two groups.
        short_fields = ["attorneys", "disposition", "otherdate", "seealso"]

        long_fields = [
            "syllabus",
            "summary",
            "history",
            "headnotes",
            "correction",
        ]

        short_data = parse_extra_fields(soup, short_fields, False)
        long_data = parse_extra_fields(soup, long_fields, True)

        # Everything for one case is saved inside a single transaction.
        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            try:
                # Nested atomic block so that a failed save can be retried
                # within the outer transaction.
                with transaction.atomic():
                    docket.save()
            except OperationalError as e:
                if "exceeds maximum" in str(e):
                    # Docket number too long: store a truncated version and
                    # keep the full one in the correction field.
                    docket.docket_number = (
                        "%s, See Corrections for full Docket Number"
                        % trunc(docket_string, length=5000, ellipsis="...")
                    )
                    docket.save()
                    long_data["correction"] = "%s <br> %s" % (
                        data["docket_number"],
                        long_data["correction"],
                    )
                # NOTE(review): any other OperationalError is swallowed here
                # and the docket stays unsaved -- confirm this is intended.

            # Handle partial dates by adding -01v to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=short_data["attorneys"],
                disposition=short_data["disposition"],
                syllabus=long_data["syllabus"],
                summary=long_data["summary"],
                history=long_data["history"],
                other_dates=short_data["otherdate"],
                cross_reference=short_data["seealso"],
                headnotes=long_data["headnotes"],
                correction=long_data["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )
            cluster.save(index=False)

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.canonical_reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )
            # Primary keys of the opinions created for this case.
            new_op_pks = []
            for op in soup.find_all("opinion"):
                # This code cleans author tags for processing.
                # It is particularly useful for identifying Per Curiam
                for elem in [op.find("author")]:
                    if elem is not None:
                        # Remove page-number tags from inside the author tag.
                        [x.extract() for x in elem.find_all("page-number")]

                auth = op.find("author")
                if auth is not None:
                    author_tag_str = titlecase(auth.text.strip(":"))
                    author_str = titlecase(
                        "".join(extract_judge_last_name(author_tag_str))
                    )
                else:
                    author_str = ""
                    author_tag_str = ""

                per_curiam = True if author_tag_str == "Per Curiam" else False
                # If Per Curiam is True set author string to Per Curiam
                if per_curiam:
                    author_str = "Per Curiam"

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                # Note: ``op`` is rebound from the parsed tag to the new
                # Opinion object here.
                op = Opinion(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    per_curiam=per_curiam,
                    extracted_by_ocr=True,
                )
                # Don't index now; do so later if desired
                op.save(index=False)
                new_op_pks.append(op.pk)

        if make_searchable:
            add_items_to_solr.delay(new_op_pks, "search.Opinion")

        logger.info("Finished: %s", citation.base_citation())
def fix_fjc_positions(self, infile=None):
    """
    Addresses issue #624.

    We had some errant regexes in the district court assignments. This code
    reassigns the court fields for these judges where the new regexes
    differ from the old ones.

    For every row, each of the (up to six) positions is matched to an
    existing Position by person, start date, termination date and type. A
    missing position is created via ``add_positions_from_row``; a single
    match has its court corrected if it differs; multiple matches are
    logged and skipped.

    :param infile: The import file with fjc-data.xlsx
    :return: None
    :raises Exception: If a court name cannot be matched to a court ID.
    """
    if infile is None:
        self.ensure_input_file()
        infile = self.options["input_file"]
    textfields = [
        "firstname",
        "midname",
        "lastname",
        "gender",
        "Place of Birth (City)",
        "Place of Birth (State)",
        "Place of Death (City)",
        "Place of Death (State)",
    ]
    df = pd.read_excel(infile, 0)
    for x in textfields:
        df[x] = df[x].replace(np.nan, "", regex=True)
    # Assign the result back instead of calling replace(inplace=True) on the
    # selected column: the chained in-place form is not guaranteed to modify
    # ``df`` (it operates on a temporary under pandas copy-on-write).
    df["Employment text field"] = df["Employment text field"].replace(
        to_replace=r";\sno", value=r", no", regex=True
    )
    for _, item in df.iterrows():
        fjc_id = item["Judge Identification Number"]
        p = Person.objects.get(fjc_id=fjc_id)
        logger.info(
            "Doing person with FJC ID: %s, "
            "https://courtlistener.com%s" % (fjc_id, p.get_absolute_url())
        )

        # Positions already matched for this person; prevents matching the
        # same Position twice.
        exclusions = []
        for posnum in range(1, 7):
            # The first position's columns carry no numeric suffix.
            if posnum > 1:
                pos_str = " (%s)" % posnum
            else:
                pos_str = ""

            court_name = item["Court Name" + pos_str]
            if pd.isnull(court_name):
                continue
            courtid = match_court_string(court_name, federal_district=True)
            if courtid is None:
                raise Exception(
                    "Unable to match court name %r for FJC ID %s"
                    % (court_name, fjc_id)
                )

            date_termination = process_date_string(
                item["Date of Termination" + pos_str]
            )
            date_start = process_date_string(
                item["Commission Date" + pos_str]
            )
            date_recess_appointment = process_date_string(
                item["Recess Appointment date" + pos_str]
            )
            if pd.isnull(date_start) and not pd.isnull(
                date_recess_appointment
            ):
                date_start = date_recess_appointment
            if pd.isnull(date_start):
                # if still no start date, skip
                continue

            positions = Position.objects.filter(
                person=p,
                date_start=date_start,
                date_termination=date_termination,
                position_type="jud",
            ).exclude(pk__in=exclusions)
            position_count = positions.count()
            if position_count < 1:
                logger.info(
                    "Couldn't find position to match '%s' on '%s' "
                    "with exclusions: %s" % (p, date_start, exclusions)
                )
                # Create just this missing position.
                add_positions_from_row(
                    item, p, self.debug, fix_nums=[posnum]
                )
                if not self.debug:
                    add_items_to_solr.delay([p.pk], "people_db.Person")
                continue
            elif position_count == 1:
                # Good case. Press on!
                position = positions[0]
                exclusions.append(position.pk)
            elif position_count > 1:
                logger.info(
                    "Got too many results for '%s' on '%s'. Got %s"
                    % (p, date_start, position_count)
                )
                continue

            if position.court.pk == courtid:
                logger.info(
                    "Court IDs are both '%s'. No changes made." % courtid
                )
            else:
                logger.info(
                    "Court IDs are different! Old: %s, New: %s"
                    % (position.court.pk, courtid)
                )
                court = Court.objects.get(pk=courtid)
                position.court = court
                if not self.debug:
                    position.save()
                    add_items_to_solr.delay([p.pk], "people_db.Person")
def add_positions_from_row(item, person, testing, fix_nums=None):
    """Create positions and related objects for one row of FJC judge data.

    Up to six positions are read from the row. The first position uses the
    plain column names; positions 2-6 use the same names with a ' (N)'
    suffix. For each position that has a court name and a start date, a
    Position is built and, depending on the row's contents, a
    PoliticalAffiliation and an ABARating.

    :param item: The row; indexed by column name (e.g. a pandas Series).
    :param person: The person the new objects are attached to.
    :param testing: If true, nothing is saved.
    :param fix_nums: Optional collection of position numbers. When given,
    only objects belonging to those positions are saved.
    :return: None
    :raises Exception: If match_court_string returns None for a court name.
    """
    # add position items (up to 6 of them)
    # Party of the first affiliation created in this call, or None so far.
    prev_politics = None
    for posnum in range(1, 7):
        # Save the position if we're running all positions or specifically
        # fixing this one.
        save_this_position = (fix_nums is None or posnum in fix_nums)
        # The first position's columns carry no numeric suffix.
        if posnum > 1:
            pos_str = ' (%s)' % posnum
        else:
            pos_str = ''

        if pd.isnull(item['Court Name' + pos_str]):
            continue
        courtid = match_court_string(item['Court Name' + pos_str],
                                     federal_district=True)
        if courtid is None:
            raise Exception

        date_nominated = process_date_string(
            item['Nomination Date Senate Executive Journal' + pos_str])
        date_recess_appointment = process_date_string(
            item['Recess Appointment date' + pos_str])
        date_referred_to_judicial_committee = process_date_string(
            item['Referral date (referral to Judicial Committee)' + pos_str])
        date_judicial_committee_action = process_date_string(
            item['Committee action date' + pos_str])
        date_hearing = process_date_string(item['Hearings' + pos_str])
        date_confirmation = process_date_string(
            item['Senate Vote Date (Confirmation Date)' + pos_str])

        # assign start date
        date_start = process_date_string(item['Commission Date' + pos_str])
        if pd.isnull(date_start) and not pd.isnull(date_recess_appointment):
            date_start = date_recess_appointment
        if pd.isnull(date_start):
            # if still no start date, skip
            continue
        date_termination = process_date_string(item['Date of Termination' +
                                                    pos_str])
        date_retirement = process_date_string(
            item['Retirement from Active Service' + pos_str])

        if date_termination is None:
            date_granularity_termination = ''
        else:
            date_granularity_termination = GRANULARITY_DAY

        # check duplicate position
        dupe_search = Position.objects.filter(
            person=person,
            position_type='jud',
            date_start=date_start,
            date_termination=date_termination,
            court_id=courtid,
        )
        if len(dupe_search) > 0:
            print('Duplicate position:', dupe_search)
            continue

        # assign appointing president
        if not pd.isnull(item['Renominating President name' + pos_str]):
            appointstr = item['Renominating President name' + pos_str]
        else:
            appointstr = item['President name' + pos_str]
        appointer = None
        if appointstr not in ['Assignment', 'Reassignment']:
            names = appointstr.split()

            if len(names) == 3:
                first, mid, last = names
            else:
                first, last = names[0], names[-1]
                mid = ''
            # Narrow the search step by step while more than one Position
            # matches: first by name, then middle name and type, then dates.
            appoint_search = Position.objects.filter(
                person__name_first__iexact=first,
                person__name_last__iexact=last)
            if len(appoint_search) > 1:
                appoint_search = Position.objects.filter(
                    person__name_first__iexact=first,
                    person__name_last__iexact=last,
                    person__name_middle__iexact=mid,
                    position_type='pres',
                )
            if len(appoint_search) > 1:
                appoint_search = Position.objects.filter(
                    person__name_first__iexact=first,
                    person__name_last__iexact=last,
                    person__name_middle__iexact=mid,
                    position_type='pres',
                    date_start__lte=date_nominated,
                    date_termination__gte=date_nominated)
            # Zero or several matches are only printed; appointer stays None.
            if len(appoint_search) == 0:
                print(names, appoint_search)
            if len(appoint_search) > 1:
                print(names, appoint_search)
            if len(appoint_search) == 1:
                appointer = appoint_search[0]

        # senate votes data
        votes = item['Senate vote Ayes/Nays' + pos_str]
        if not pd.isnull(votes):
            votes_yes, votes_no = votes.split('/')
        else:
            votes_yes = None
            votes_no = None
        if item['Senate voice vote' + pos_str] == "Yes":
            voice_vote = True
        else:
            voice_vote = False

        # Maps the termination text in the row to the code stored on Position.
        termdict = {
            'Abolition of Court': 'abolished',
            'Death': 'ded',
            'Reassignment': 'other_pos',
            'Appointment to Another Judicial Position': 'other_pos',
            'Impeachment & Conviction': 'bad_judge',
            'Recess Appointment-Not Confirmed': 'recess_not_confirmed',
            'Resignation': 'resign',
            'Retirement': 'retire_vol'
        }
        term_reason = item['Termination specific reason' + pos_str]
        if pd.isnull(term_reason):
            term_reason = ''
        else:
            term_reason = termdict[term_reason]

        position = Position(
            person=person,
            court_id=courtid,
            position_type='jud',
            date_nominated=date_nominated,
            date_recess_appointment=date_recess_appointment,
            date_referred_to_judicial_committee=
            date_referred_to_judicial_committee,
            date_judicial_committee_action=date_judicial_committee_action,
            date_hearing=date_hearing,
            date_confirmation=date_confirmation,
            date_start=date_start,
            date_granularity_start=GRANULARITY_DAY,
            date_termination=date_termination,
            date_granularity_termination=date_granularity_termination,
            date_retirement=date_retirement,
            appointer=appointer,
            voice_vote=voice_vote,
            votes_yes=votes_yes,
            votes_no=votes_no,
            vote_type='s',
            how_selected='a_pres',
            termination_reason=term_reason)

        if not testing and save_this_position:
            position.save()

        # set party
        p = item['Party Affiliation of President' + pos_str]
        if not pd.isnull(p) and p not in ['Assignment', 'Reassignment']:
            party = get_party(item['Party Affiliation of President' + pos_str])
            if prev_politics is None:
                # First affiliation seen in this row.
                if pd.isnull(date_nominated):
                    politicsgran = ''
                else:
                    politicsgran = GRANULARITY_DAY
                politics = PoliticalAffiliation(
                    person=person,
                    political_party=party,
                    date_start=date_nominated,
                    date_granularity_start=politicsgran,
                    source='a',
                )
                if not testing and save_this_position:
                    politics.save()
                prev_politics = party
            elif party != prev_politics:
                # account for changing political affiliation
                # NOTE(review): prev_politics is not updated in this branch,
                # so later positions are still compared against the first
                # party -- confirm this is intended.
                politics.date_end = date_nominated
                politics.date_granularity_end = GRANULARITY_DAY
                if not testing and save_this_position:
                    politics.save()
                politics = PoliticalAffiliation(
                    person=person,
                    political_party=party,
                    date_start=date_nominated,
                    date_granularity_start=GRANULARITY_DAY,
                    source='a')
                if not testing and save_this_position:
                    politics.save()
        rating = get_aba(item['ABA Rating' + pos_str])
        if rating is not None:
            # NOTE(review): assumes date_nominated is set whenever a rating
            # exists -- confirm.
            nom_year = date_nominated.year
            aba = ABARating(person=person, rating=rating, year_rated=nom_year)
            if not testing and save_this_position:
                aba.save()
def parse_harvard_opinions(reporter, volume):
    """
    Parse downloaded CaseLaw Corpus from internet archive and add them to our
    database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA.  (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    For every file that is not already in the system, a Docket, an
    OpinionCluster, a Citation and one Opinion per <opinion> tag are created.

    :param volume: The volume (int) of the reporters (optional) (ex 10)
    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :return: None
    """
    # A volume only makes sense together with a reporter.
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        # Used in log messages only; built from the tail of the local path.
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        # Skip files that were already imported.
        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            # json.load raises ValueError when the content is not valid JSON.
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        # Only the first citation in the file is parsed.
        cites = get_citations(data["citations"][0]["cite"], html=False)
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            find_judge_names(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            find_judge_names(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            list(set(itertools.chain.from_iterable(judge_list + author_list)))
        )
        judges = titlecase(judges)
        # Strip the "Docket No(s)." label from the docket number.
        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        # Everything for one case is created inside a single transaction.
        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            # Iterate over other xml fields in Harvard data set
            # and save as string list for further processing at a later date.
            json_fields = [
                "attorneys",
                "disposition",
                "syllabus",
                "summary",
                "history",
                "otherdate",
                "seealso",
                "headnotes",
                "correction",
            ]
            # Tag name -> "|"-joined text of all tags with that name.
            data_set = {}
            while json_fields:
                key = json_fields.pop(0)
                data_set[key] = "|".join([x.text for x in soup.find_all(key)])

            # Handle partial dates by adding -01v to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=data_set["attorneys"],
                disposition=data_set["disposition"],
                syllabus=data_set["syllabus"],
                summary=data_set["summary"],
                history=data_set["history"],
                other_dates=data_set["otherdate"],
                cross_reference=data_set["seealso"],
                headnotes=data_set["headnotes"],
                correction=data_set["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )
            for op in soup.find_all("opinion"):
                # Both strings are built from the document-wide judge and
                # author lists, so they are the same for every opinion.
                joined_by_str = titlecase(
                    " ".join(
                        list(set(itertools.chain.from_iterable(judge_list)))
                    )
                )
                author_str = titlecase(
                    " ".join(
                        list(set(itertools.chain.from_iterable(author_list)))
                    )
                )

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                Opinion.objects.create(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    joined_by_str=joined_by_str,
                    extracted_by_ocr=True,
                )

        logger.info("Finished: %s", citation.base_citation())
def add_positions_from_row(item, person, testing, fix_nums=None):
    """Build Position, PoliticalAffiliation and ABARating objects from a row.

    The row may describe up to six positions. Columns of the first position
    have no suffix; those of positions 2-6 end in ' (N)'. Positions without
    a court name or without a start date are skipped, as are positions for
    which an identical Position already exists.

    :param item: The row; indexed by column name (e.g. a pandas Series).
    :param person: The person the new objects are attached to.
    :param testing: If true, nothing is saved.
    :param fix_nums: Optional collection of position numbers. When given,
    only objects belonging to those positions are saved.
    :return: None
    :raises Exception: If match_court_string returns None for a court name.
    """
    # add position items (up to 6 of them)
    # Party of the first affiliation created in this call, or None so far.
    prev_politics = None
    for posnum in range(1, 7):
        # Save the position if we're running all positions or specifically
        # fixing this one.
        save_this_position = (fix_nums is None or posnum in fix_nums)
        # The first position's columns carry no numeric suffix.
        if posnum > 1:
            pos_str = ' (%s)' % posnum
        else:
            pos_str = ''

        if pd.isnull(item['Court Name' + pos_str]):
            continue
        courtid = match_court_string(item['Court Name' + pos_str],
                                     federal_district=True)
        if courtid is None:
            raise Exception

        date_nominated = process_date_string(
            item['Nomination Date Senate Executive Journal' + pos_str])
        date_recess_appointment = process_date_string(
            item['Recess Appointment date' + pos_str])
        date_referred_to_judicial_committee = process_date_string(
            item['Referral date (referral to Judicial Committee)' + pos_str])
        date_judicial_committee_action = process_date_string(
            item['Committee action date' + pos_str])
        date_hearing = process_date_string(item['Hearings' + pos_str])
        date_confirmation = process_date_string(
            item['Senate Vote Date (Confirmation Date)' + pos_str])

        # assign start date
        date_start = process_date_string(item['Commission Date' + pos_str])
        if pd.isnull(date_start) and not pd.isnull(date_recess_appointment):
            date_start = date_recess_appointment
        if pd.isnull(date_start):
            # if still no start date, skip
            continue
        date_termination = process_date_string(
            item['Date of Termination' + pos_str])
        date_retirement = process_date_string(
            item['Retirement from Active Service' + pos_str])

        if date_termination is None:
            date_granularity_termination = ''
        else:
            date_granularity_termination = GRANULARITY_DAY

        # check duplicate position
        dupe_search = Position.objects.filter(
            person=person,
            position_type='jud',
            date_start=date_start,
            date_termination=date_termination,
            court_id=courtid,
        )
        if len(dupe_search) > 0:
            print('Duplicate position:', dupe_search)
            continue

        # assign appointing president
        if not pd.isnull(item['Renominating President name' + pos_str]):
            appointstr = item['Renominating President name' + pos_str]
        else:
            appointstr = item['President name' + pos_str]
        appointer = None
        if appointstr not in ['Assignment', 'Reassignment']:
            names = appointstr.split()

            if len(names) == 3:
                first, mid, last = names
            else:
                first, last = names[0], names[-1]
                mid = ''
            # Narrow the search step by step while more than one Position
            # matches: first by name, then middle name and type, then dates.
            appoint_search = Position.objects.filter(
                person__name_first__iexact=first,
                person__name_last__iexact=last)
            if len(appoint_search) > 1:
                appoint_search = Position.objects.filter(
                    person__name_first__iexact=first,
                    person__name_last__iexact=last,
                    person__name_middle__iexact=mid,
                    position_type='pres',
                )
            if len(appoint_search) > 1:
                appoint_search = Position.objects.filter(
                    person__name_first__iexact=first,
                    person__name_last__iexact=last,
                    person__name_middle__iexact=mid,
                    position_type='pres',
                    date_start__lte=date_nominated,
                    date_termination__gte=date_nominated
                )
            # Zero or several matches are only printed; appointer stays None.
            if len(appoint_search) == 0:
                print(names, appoint_search)
            if len(appoint_search) > 1:
                print(names, appoint_search)
            if len(appoint_search) == 1:
                appointer = appoint_search[0]

        # senate votes data
        votes = item['Senate vote Ayes/Nays' + pos_str]
        if not pd.isnull(votes):
            votes_yes, votes_no = votes.split('/')
        else:
            votes_yes = None
            votes_no = None
        if item['Senate voice vote' + pos_str] == "Yes":
            voice_vote = True
        else:
            voice_vote = False

        # Maps the termination text in the row to the code stored on Position.
        termdict = {'Abolition of Court': 'abolished',
                    'Death': 'ded',
                    'Reassignment': 'other_pos',
                    'Appointment to Another Judicial Position': 'other_pos',
                    'Impeachment & Conviction': 'bad_judge',
                    'Recess Appointment-Not Confirmed': 'recess_not_confirmed',
                    'Resignation': 'resign',
                    'Retirement': 'retire_vol'
                    }
        term_reason = item['Termination specific reason' + pos_str]
        if pd.isnull(term_reason):
            term_reason = ''
        else:
            term_reason = termdict[term_reason]

        position = Position(
            person=person,
            court_id=courtid,
            position_type='jud',
            date_nominated=date_nominated,
            date_recess_appointment=date_recess_appointment,
            date_referred_to_judicial_committee=date_referred_to_judicial_committee,
            date_judicial_committee_action=date_judicial_committee_action,
            date_hearing=date_hearing,
            date_confirmation=date_confirmation,
            date_start=date_start,
            date_granularity_start=GRANULARITY_DAY,
            date_termination=date_termination,
            date_granularity_termination=date_granularity_termination,
            date_retirement=date_retirement,
            appointer=appointer,
            voice_vote=voice_vote,
            votes_yes=votes_yes,
            votes_no=votes_no,
            vote_type='s',
            how_selected='a_pres',
            termination_reason=term_reason
        )

        if not testing and save_this_position:
            position.save()

        # set party
        p = item['Party Affiliation of President' + pos_str]
        if not pd.isnull(p) and p not in ['Assignment', 'Reassignment']:
            party = get_party(item['Party Affiliation of President' + pos_str])
            if prev_politics is None:
                # First affiliation seen in this row.
                if pd.isnull(date_nominated):
                    politicsgran = ''
                else:
                    politicsgran = GRANULARITY_DAY
                politics = PoliticalAffiliation(
                    person=person,
                    political_party=party,
                    date_start=date_nominated,
                    date_granularity_start=politicsgran,
                    source='a',
                )
                if not testing and save_this_position:
                    politics.save()
                prev_politics = party
            elif party != prev_politics:
                # account for changing political affiliation
                # NOTE(review): prev_politics is not updated in this branch,
                # so later positions are still compared against the first
                # party -- confirm this is intended.
                politics.date_end = date_nominated
                politics.date_granularity_end = GRANULARITY_DAY
                if not testing and save_this_position:
                    politics.save()
                politics = PoliticalAffiliation(
                    person=person,
                    political_party=party,
                    date_start=date_nominated,
                    date_granularity_start=GRANULARITY_DAY,
                    source='a'
                )
                if not testing and save_this_position:
                    politics.save()
        rating = get_aba(item['ABA Rating' + pos_str])
        if rating is not None:
            # NOTE(review): assumes date_nominated is set whenever a rating
            # exists -- confirm.
            nom_year = date_nominated.year
            aba = ABARating(
                person=person,
                rating=rating,
                year_rated=nom_year
            )
            if not testing and save_this_position:
                aba.save()
def fix_fjc_positions(self, infile=None):
    """Reassign the court on FJC judicial positions (issue #624).

    We had some errant regexes in the district court assignments. This
    re-reads the FJC spreadsheet, re-matches every court name with the
    current regexes, and updates any stored position whose court differs
    from the new match. Positions that cannot be found are re-created
    from the spreadsheet row.

    When ``self.debug`` is set, nothing is saved by this method and no
    Solr indexing task is queued; ``self.debug`` is also passed through
    to ``add_positions_from_row``.

    :param infile: The import file with fjc-data.xlsx. When None, the
        path is taken from ``self.options['input_file']`` after calling
        ``self.ensure_input_file()``.
    :return: None
    :raises Exception: If a court name in the spreadsheet cannot be
        matched to a court ID.
    """
    if infile is None:
        self.ensure_input_file()
        infile = self.options['input_file']

    textfields = ['firstname', 'midname', 'lastname', 'gender',
                  'Place of Birth (City)', 'Place of Birth (State)',
                  'Place of Death (City)', 'Place of Death (State)']
    df = pd.read_excel(infile, 0)
    for x in textfields:
        df[x] = df[x].replace(np.nan, '', regex=True)
    # Assign the result back instead of using inplace=True on the column
    # selection: an in-place replace on a chained selection is not
    # guaranteed to update the parent DataFrame.
    df['Employment text field'] = df['Employment text field'].replace(
        to_replace=r';\sno', value=r', no', regex=True)

    for _, item in df.iterrows():
        fjc_id = item['Judge Identification Number']
        # NOTE(review): a missing Person is not handled here, so the
        # lookup error propagates and aborts the run.
        p = Person.objects.get(fjc_id=fjc_id)
        logger.info("Doing person with FJC ID: %s, "
                    "https://courtlistener.com%s",
                    fjc_id, p.get_absolute_url())

        # Primary keys of positions already paired with a spreadsheet
        # column, so that a later column with the same dates cannot be
        # matched to the same stored position.
        exclusions = []
        for posnum in range(1, 7):
            # The first position's columns carry no suffix; later ones
            # are suffixed " (2)" ... " (6)".
            if posnum > 1:
                pos_str = ' (%s)' % posnum
            else:
                pos_str = ''

            court_name = item['Court Name' + pos_str]
            if pd.isnull(court_name):
                continue
            courtid = match_court_string(court_name, federal_district=True)
            if courtid is None:
                raise Exception(
                    "Unable to match court name '%s' for FJC ID %s"
                    % (court_name, fjc_id))

            date_termination = process_date_string(
                item['Date of Termination' + pos_str])
            date_start = process_date_string(
                item['Commission Date' + pos_str])
            date_recess_appointment = process_date_string(
                item['Recess Appointment date' + pos_str])
            # Fall back to the recess appointment date when there is no
            # commission date.
            if pd.isnull(date_start) and not pd.isnull(
                    date_recess_appointment):
                date_start = date_recess_appointment
            if pd.isnull(date_start):
                # if still no start date, skip
                continue

            positions = (Position.objects
                         .filter(person=p, date_start=date_start,
                                 date_termination=date_termination,
                                 position_type='jud')
                         .exclude(pk__in=exclusions))
            position_count = positions.count()
            if position_count < 1:
                logger.info("Couldn't find position to match '%s' on '%s' "
                            "with exclusions: %s",
                            p, date_start, exclusions)
                add_positions_from_row(item, p, self.debug,
                                       fix_nums=[posnum])
                if not self.debug:
                    add_items_to_solr.delay([p.pk], 'people_db.Person')
                continue
            elif position_count == 1:
                # Good case. Press on!
                position = positions[0]
                exclusions.append(position.pk)
            else:
                logger.info("Got too many results for '%s' on '%s'. Got %s",
                            p, date_start, position_count)
                continue

            # Compare the raw foreign key so that a position without a
            # court is repaired instead of failing on attribute access.
            old_courtid = position.court_id
            if old_courtid == courtid:
                logger.info("Court IDs are both '%s'. No changes made.",
                            courtid)
            else:
                logger.info("Court IDs are different! Old: %s, New: %s",
                            old_courtid, courtid)
                court = Court.objects.get(pk=courtid)
                position.court = court

                if not self.debug:
                    position.save()
                    add_items_to_solr.delay([p.pk], 'people_db.Person')