def get_judges(self, node):
    """Parse out the judge string and then look it up in the DB"""
    try:
        s = self.case_details.xpath('%s/text()' % node)[0].strip()
    except IndexError:
        print " Couldn't get judge for node: %s" % node
        return None, ''
    else:
        judge_names = find_judge_names(s)
        judges = []
        for judge_name in judge_names:
            judges.append(find_person(judge_name, self.court.pk,
                                      case_date=self.date_filed))
        judges = [c for c in judges if c is not None]
        if len(judges) == 0:
            print " No judges found after lookup."
            logger.info("No judge for: %s" %
                        ((s, self.court.pk, self.date_filed),))
            return None, s
        elif len(judges) == 1:
            return judges[0], s
        elif len(judges) > 1:
            print " Too many judges found: %s" % len(judges)
            return None, s
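# Hedged usage sketch (names assumed, not from the source): get_judges is a
# method on a scraper-style object, so this presumes a hypothetical instance
# with case_details (a parsed lxml tree), court, and date_filed already set.
judge, judge_str = importer.get_judges('//judges')  # assumed xpath
if judge is None and judge_str:
    # Raw text was found, but zero or multiple people matched in the DB;
    # judge_str still carries the original string for later inspection.
    pass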
def get_candidate_judge_objects(judge_str, court_id, event_date):
    """Take a string of text in a time and place and figure out which judges
    match up to it.
    """
    judges = find_judge_names(judge_str)
    if len(judges) == 0:
        return []

    candidates = []
    for judge in judges:
        candidates.append(find_person(judge, court_id, case_date=event_date))
    return [c for c in candidates if c is not None]
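# Minimal illustration (court ID and date are assumptions; the judge string is
# borrowed from the tests below): every parsed name is looked up against the
# DB and non-matches are dropped, so the result is a possibly empty list of
# Person objects.
from datetime import date

candidates = get_candidate_judge_objects(
    'OVERTON, J. —', 'tex', date(1850, 1, 1))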
def test_get_judge_from_string_columbia(self):
    """Can we cleanly get a judge value from a string?"""
    tests = (
        (
            "CLAYTON <italic>Ch. Jus. of the Superior Court,</italic> "
            "delivered the following opinion of this Court: ",
            ["clayton"],
        ),
        ("OVERTON, J. — ", ["overton"]),
        ("BURWELL, J.:", ["burwell"]),
    )
    for q, a in tests:
        self.assertEqual(find_judge_names(q), a)
def get_candidate_judges(judge_str, court_id, event_date):
    """Figure out who a judge is from a string and some metadata.

    :param judge_str: A string containing the judge's name.
    :param court_id: A CL Court ID where the case occurred.
    :param event_date: The date of the case.
    :return: A list of Person objects matching the parsed judge names, empty
    if no names could be parsed or none matched the DB.
    """
    if not judge_str:
        # Return an empty list (not None) so callers can always iterate.
        return []

    judges = find_judge_names(judge_str)
    if len(judges) == 0:
        return []

    candidates = []
    for judge in judges:
        candidates.append(find_person(judge, court_id, case_date=event_date))
    return [c for c in candidates if c is not None]
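# Example (court ID and date are assumed values; `date` from the datetime
# module): given the empty-list return above, an empty judge string
# short-circuits before any parsing or DB lookups are attempted.
from datetime import date

assert get_candidate_judges('', 'ca1', date(2000, 1, 1)) == []
candidates = get_candidate_judges('BURWELL, J.:', 'ca1', date(2000, 1, 1))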
def parse_harvard_opinions(reporter, volume, make_searchable):
    """
    Parse downloaded CaseLaw Corpus from internet archive and add them to our
    database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA. (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    :param volume: The volume (int) of the reporters (optional) (ex 10)
    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :param make_searchable: Boolean to indicate saving to solr
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"], html=False)
        if not cites:
            logger.info("No citation found for %s."
                        % data["citations"][0]["cite"])
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name, file_path):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML.
        # Flag them for a later crawl by using the placeholder '[[Image]]'.
        judge_list = [
            find_judge_names(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            find_judge_names(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe the list of judges.
        judges = ", ".join(
            sorted(
                set(itertools.chain.from_iterable(judge_list + author_list))
            )
        )
        judges = titlecase(judges)
        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        short_fields = ["attorneys", "disposition", "otherdate", "seealso"]
        long_fields = [
            "syllabus",
            "summary",
            "history",
            "headnotes",
            "correction",
        ]
        short_data = parse_extra_fields(soup, short_fields, False)
        long_data = parse_extra_fields(soup, long_fields, True)

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            try:
                with transaction.atomic():
                    docket.save()
            except OperationalError as e:
                if "exceeds maximum" in str(e):
                    docket.docket_number = (
                        "%s, See Corrections for full Docket Number"
                        % trunc(docket_string, length=5000, ellipsis="...")
                    )
                    docket.save()
                    long_data["correction"] = "%s <br> %s" % (
                        data["docket_number"],
                        long_data["correction"],
                    )

            # Handle partial dates by adding -01 to YYYY-MM dates.
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=short_data["attorneys"],
                disposition=short_data["disposition"],
                syllabus=long_data["syllabus"],
                summary=long_data["summary"],
                history=long_data["history"],
                other_dates=short_data["otherdate"],
                cross_reference=short_data["seealso"],
                headnotes=long_data["headnotes"],
                correction=long_data["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )
            cluster.save(index=False)

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.canonical_reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )
            new_op_pks = []
            for op in soup.find_all("opinion"):
                # This code cleans author tags for processing.
                # It is particularly useful for identifying Per Curiam.
                for elem in [op.find("author")]:
                    if elem is not None:
                        [x.extract() for x in elem.find_all("page-number")]

                auth = op.find("author")
                if auth is not None:
                    author_tag_str = titlecase(auth.text.strip(":"))
                    author_str = titlecase(
                        "".join(find_judge_names(author_tag_str))
                    )
                else:
                    author_str = ""
                    author_tag_str = ""

                per_curiam = author_tag_str == "Per Curiam"
                # If Per Curiam is True, set the author string to Per Curiam.
                if per_curiam:
                    author_str = "Per Curiam"

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                op = Opinion(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    per_curiam=per_curiam,
                    extracted_by_ocr=True,
                )
                # Don't index now; do so later if desired.
                op.save(index=False)
                new_op_pks.append(op.pk)

        if make_searchable:
            add_items_to_solr.delay(new_op_pks, "search.Opinion")

        logger.info("Finished: %s", citation.base_citation())
import pandas as pd
from datetime import datetime as dt
# find_judge_names and find_person come from the project's judge-lookup
# utilities; their imports were omitted in the original snippet.

df = pd.read_csv('/vagrant/flp/columbia_data/judges/fed-judges-test.csv')
cas = ['ca' + str(n) for n in range(1, 12)]

matchcount = 0
panelcount = 0
zerocount = 0
for i, row in df.iterrows():
    #if row.court_id not in cas:
    #    continue
    if pd.isnull(row.judges):
        continue
    judges = find_judge_names(row.judges)
    date_filed = dt.strptime(row.date_filed, "%Y-%m-%d")
    candidates = []
    for judge in judges:
        candidates.append(
            find_person(judge, row.court_id, case_date=date_filed))
    candidates = [c for c in candidates if c is not None]
    if len(candidates) == 1:
        matchcount += 1  # exactly one DB match: treat as the author
        author = candidates[0]
        print(author)
    elif len(candidates) > 1:
        panelcount += 1  # several DB matches: treat as a panel
        panel = candidates
        print(panel)
    else:
        zerocount += 1   # no matches at all
        print('No match.', row.judges)
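# With the counters wired up above, a one-line summary (formatting assumed)
# makes the evaluation run easy to eyeball:
print('matched: %d, panels: %d, no match: %d of %d rows'
      % (matchcount, panelcount, zerocount, len(df)))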
def assign_authors(testing=False):
    clusters = (OpinionCluster.objects
                .exclude(judges='')
                .exclude(docket__court__jurisdiction='FB')
                .select_related('docket__court__id')
                .only('date_filed', 'judges', 'docket__court_id'))
    total = clusters.count()
    i = 0

    for cluster in clusters:
        i += 1
        print u"(%s/%s): Processing: %s, %s" % (i, total, cluster.pk,
                                                cluster.date_filed)

        judgestr = unidecode(cluster.judges)
        print " Judge string: %s" % judgestr

        if 'curiam' in judgestr.lower():
            opinion = cluster.sub_opinions.all()[0]
            opinion.per_curiam = True
            print u' Per Curiam assigned.'
            if not testing:
                opinion.save(index=False)
            continue

        judges = find_judge_names(judgestr)
        if len(judges) == 0:
            continue

        candidates = []
        for judge in judges:
            candidates.append(find_person(judge,
                                          cluster.docket.court_id,
                                          case_date=cluster.date_filed))
        candidates = [c for c in candidates if c is not None]

        if len(candidates) == 0:
            # Judge token(s) found, but no DB matches; move on.
            print u' No match.'
            continue

        if len(candidates) > 1:
            # More than one DB match: assign a panel and continue.
            print u' Panel assigned: %s' % candidates
            if not testing:
                for candidate in candidates:
                    cluster.panel.add(candidate)
            continue

        # Only one candidate: assign the author.
        opinion = cluster.sub_opinions.all()[0]
        if len(judges) == 1:
            # One judge token, one DB match.
            opinion.author = candidates[0]
            print ' Author assigned: %s' % unidecode(str(candidates[0]))
        else:
            # Multiple judge tokens, one DB match.
            opinion.author = candidates[0]
            print ' Author assigned: %s (with %d missing tokens)' % (
                unidecode(str(candidates[0])),
                len(judges) - 1,
            )
        if not testing:
            opinion.save(index=False)
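# The `testing` flag above enables a dry run: every assignment is printed but
# nothing is saved, so matching quality can be audited before committing.
assign_authors(testing=True)   # audit only
assign_authors(testing=False)  # persist author/panel/per-curiam assignments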
def parse_harvard_opinions(reporter, volume):
    """
    Parse downloaded CaseLaw Corpus from internet archive and add them to our
    database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA. (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    :param volume: The volume (int) of the reporters (optional) (ex 10)
    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"], html=False)
        if not cites:
            logger.info("No citation found for %s."
                        % data["citations"][0]["cite"])
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML.
        # Flag them for a later crawl by using the placeholder '[[Image]]'.
        judge_list = [
            find_judge_names(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            find_judge_names(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe the list of judges.
        judges = ", ".join(
            set(itertools.chain.from_iterable(judge_list + author_list))
        )
        judges = titlecase(judges)
        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )

            # Iterate over other XML fields in the Harvard data set and save
            # them as joined strings for further processing at a later date.
            json_fields = [
                "attorneys",
                "disposition",
                "syllabus",
                "summary",
                "history",
                "otherdate",
                "seealso",
                "headnotes",
                "correction",
            ]
            data_set = {}
            for key in json_fields:
                data_set[key] = "|".join([x.text for x in soup.find_all(key)])

            # Handle partial dates by adding -01 to YYYY-MM dates.
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=data_set["attorneys"],
                disposition=data_set["disposition"],
                syllabus=data_set["syllabus"],
                summary=data_set["summary"],
                history=data_set["history"],
                other_dates=data_set["otherdate"],
                cross_reference=data_set["seealso"],
                headnotes=data_set["headnotes"],
                correction=data_set["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )
            for op in soup.find_all("opinion"):
                joined_by_str = titlecase(
                    " ".join(
                        set(itertools.chain.from_iterable(judge_list))
                    )
                )
                author_str = titlecase(
                    " ".join(
                        set(itertools.chain.from_iterable(author_list))
                    )
                )
                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                Opinion.objects.create(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    joined_by_str=joined_by_str,
                    extracted_by_ocr=True,
                )

        logger.info("Finished: %s", citation.base_citation())
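# The inline json_fields loop above is what the newer version of this command
# refactors into parse_extra_fields(soup, fields, long_field). A sketch that
# is consistent with both call sites; the "|" joiner comes from the inline
# loop, while the long/short split and the " <br> " joiner are assumptions
# (the " <br> " separator appears elsewhere in the correction handling).
def parse_extra_fields_sketch(soup, fields, long_field=False):
    """Join the text of each named casebody tag into one string per field."""
    data_set = {}
    for field in fields:
        elements = [x.text for x in soup.find_all(field)]
        joiner = " <br> " if long_field else "|"
        data_set[field] = joiner.join(elements)
    return data_set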