def get_case_matches(fpath, patterns, wide_net, computed_attrs={}, rlim=None,
                     wide_net_fn=None, skip_non_matches=False):
    '''
    Process a case and return observation rows

    Output:
        (list) of observation rows (dicts)
    '''
    case_rows = []
    case = dtools.load_case(fpath)

    for ind, line in enumerate(case['docket']):
        if wide_net_match_line(line, case, wide_net, wide_net_fn):
            # Use the row builder
            row = row_builder(docket_line=line, ind=ind, case=case, fpath=fpath,
                              patterns=patterns, computed_attrs=computed_attrs, rlim=rlim)

            if skip_non_matches:
                # Only add the row if at least one pattern matched
                if not any(v for k, v in row.items() if k in patterns):
                    continue

            case_rows.append(row)

    return case_rows
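# Usage sketch (not from the source): one way get_case_matches might be called.
# It assumes `patterns` maps output column names to compiled regexes and that
# `wide_net` is a coarse pre-filter regex; the pattern names and the file path
# below are hypothetical.
def _example_get_case_matches():
    import re
    example_patterns = {
        'motion_to_seal': re.compile(r'motion to seal', re.I),
        'protective_order': re.compile(r'protective order', re.I),
    }
    example_wide_net = re.compile(r'seal|redact|protective|restrict', re.I)
    # skip_non_matches drops wide-net hits that match none of the finer patterns
    return get_case_matches(
        fpath='path/to/case.json',
        patterns=example_patterns,
        wide_net=example_wide_net,
        skip_non_matches=True,
    )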
def build_seal_idx():
    '''
    Build a text file with a list of indexes (from the unique files df) of cases
    whose dockets contain the words 'seal/redact/protective/restricted'
    '''
    seal_idx = []
    dff = dt.load_unique_files_df()

    for i, row in tqdm(dff.iterrows(), total=dff.shape[0]):
        case = dt.load_case(row.fpath)
        if dof.find_pattern(case.get('docket', []), RE_WIDE_NET, rlim=RLIM):
            # Record the index of cases that hit the wide net
            seal_idx.append(i)
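# Sketch (assumption, not the source's RE_WIDE_NET / dof.find_pattern): a minimal
# version of the wide-net check, scanning at most `rlim` docket entries for
# sealing-related terms. The real regex and entry schema may differ.
def _sketch_wide_net_hit(docket, rlim=None):
    import re
    wide_net = re.compile(r'\b(seal|redact|protective|restrict)', re.I)
    entries = docket if rlim is None else docket[:rlim]
    for entry in entries:
        # Entries are assumed to be dicts with a 'docket_text' field, or plain strings
        text = entry.get('docket_text', '') if isinstance(entry, dict) else str(entry)
        if wide_net.search(text):
            return True
    return False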
def identify_judge_entriesv1(jfhandle=None, docket=None, djudge=''):
    '''
    Attributes each docket entry to a judge.

    V1 -- Regex rules to identify docket 'chunks' and then do block attribution to each chunk.
    Returns [[district, case_id, judge_name, nos, clean_nos, entry number, entry text], ...]

    input:
        * jfhandle -- str (opt), Filename for the json file
        * docket -- list (opt), default is None. Will supersede the json file
    output:
        * judge_ind_entries -- list, [judge name, judge name, ...]
    '''

    def _clean_name(judge_name, punc=True):
        # Deletions
        titles = ['Judge ', 'Senior Judge ', 'Magistrate Judge ', 'Chief Judge ', 'Honorable ']
        puncs = [r'\.', ',']

        # Clean the titles (and optionally the punctuation)
        try:
            for title in titles:
                if title in judge_name:
                    judge_name = judge_name.split(title)[-1]
            if punc:
                for p in puncs:
                    judge_name = re.sub(p, '', judge_name)
            judge_name = judge_name.strip(' ')
        except TypeError:
            pass

        return judge_name

    import json
    import re
    import spacy
    import sys
    sys.path.append('..')
    import support.data_tools as dtools
    import support.settings as settings

    # Input check
    if jfhandle is None and docket is None:
        print('INPUT ERROR: No docket or json filehandle provided. Returning None')
        return None

    # Load the data
    if not docket:
        case = dtools.load_case(jfhandle)
        docket = case['docket']
        djudge = case['judge']
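# Sketch (for illustration only): a standalone version of the _clean_name logic
# above, since _clean_name is nested inside identify_judge_entriesv1 and cannot
# be called directly from outside the function.
def _sketch_clean_judge_name(judge_name):
    import re
    titles = ['Judge ', 'Senior Judge ', 'Magistrate Judge ', 'Chief Judge ', 'Honorable ']
    for title in titles:
        if title in judge_name:
            judge_name = judge_name.split(title)[-1]
    return re.sub(r'[.,]', '', judge_name).strip()

# e.g. _sketch_clean_judge_name('Honorable John Q. Smith') -> 'John Q Smith'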
def main(outfile, sample_n, year_inp, court_inp, nos_inp, allow_non_matches):
    '''
    Process all of the courts to build the dataset
    '''
    # Gather the filepaths csv
    files_df = dt.load_unique_files_df()

    # Filter down to the relevant "seal" cases
    files_df = filter_cases_seal(files_df).copy()

    if year_inp:
        files_df = files_df[files_df.year == year_inp].copy()
        print(f'Running only on cases from {year_inp}, reduced dataset to {len(files_df):,} cases')

    if court_inp:
        files_df = files_df[files_df.court == court_inp].copy()
        print(f'Running only on cases from {court_inp}, reduced dataset to {len(files_df):,} cases')

    if nos_inp:
        files_df = files_df[files_df.nature_suit.fillna('').str.startswith(str(nos_inp))].copy()
        print(f'Running only on cases with nature of suit {nos_inp}, reduced dataset to {len(files_df):,} cases')

    # If a sample size is specified, run on a random subset
    if sample_n:
        files_df = files_df.sample(sample_n).copy()
        print(f'Running on random subset of size {sample_n:,}')

    print(f'Processing {len(files_df):,} cases...\n')

    # Build the csv file line-by-line
    out_file_name = outfile
    col_names = ['court', 'judge', 'case_id', 'ucid', 'line_ind', 'fpath', 'case_type',
                 'nature_suit', 'text', 'date_docket_line', 'days_from_filing',
                 'is_multi', 'is_mdl', 'mdl_code', *pats.keys()]

    w_count = 0  # Keep count of lines written
    with open(out_file_name, 'w+', encoding="utf-8") as wfile:
        writer = csv.writer(wfile)
        writer.writerow(col_names)

        # Iterate through all relevant files
        for i, row in tqdm(files_df.iterrows(), total=len(files_df), desc="Files Processed"):
            case = dt.load_case(row.fpath)

            if 'docket' not in case.keys():
                continue
            if type(case['docket'][0]) == list:
                tqdm.write(i)
                continue
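# Sketch (assumption): a minimal command-line wrapper mirroring main()'s signature.
# The real script may use a different CLI library or option names; only the
# parameter list is taken from main() above.
def _sketch_cli():
    import argparse
    parser = argparse.ArgumentParser(description='Build the sealed-docket dataset')
    parser.add_argument('outfile', help='path of the output csv')
    parser.add_argument('--sample-n', type=int, default=None, help='run on a random subset of cases')
    parser.add_argument('--year', type=int, default=None, dest='year_inp')
    parser.add_argument('--court', default=None, dest='court_inp')
    parser.add_argument('--nos', default=None, dest='nos_inp', help='nature-of-suit prefix filter')
    parser.add_argument('--allow-non-matches', action='store_true')
    args = parser.parse_args()
    main(args.outfile, args.sample_n, args.year_inp, args.court_inp, args.nos_inp, args.allow_non_matches)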
def _update_case_(row, indent):
    '''
    Update a single case file json with idb data

    Inputs:
        - row (Series or dict): row of the merged dataframe that contains data on
            the case (fpath, case_type and all idb_recap columns needed)
        - indent (int): size of indent if pretty printing
    '''
    if row.recap:
        # Recap: actually fix the incorrect data
        # Get the case and update the idb_data key
        case = dtools.load_case(row.fpath, recap_orig=True)
        case['idb_data'] = extract_recap_idb_data(row, row.case_type)

        # Update the outer json with idb_data
        for key in ['date_filed', 'date_terminated', 'nature_of_suit']:
            case[key] = case['idb_data'][key]
    else:
        # Pacer: just add in the data
        case = dtools.load_case(row.fpath)
        case['idb_data'] = extract_recap_idb_data(row, row.case_type)

    with open(settings.PROJECT_ROOT / row.fpath, 'w+', encoding='utf-8') as wfile:
        simplejson.dump(case, wfile, ignore_nan=True, indent=indent)
def build_df_dur(df, year=2016):
    '''
    Build a table with the relevant case duration data
    (need to open each case to verify the latest date)

    Inputs:
        - df (pd.DataFrame): the main docket-line level dataframe of sealed data
        - year (int): the year
    '''
    dff = dtools.load_unique_files_df()

    # Get the subset of cases from the unique files table that are patent cases
    cases_pat = dff[dff.nature_suit.eq(PATENT_NOS) & dff.year.eq(year) & ~dff.is_multi.eq(True)].copy()
    cases_pat['is_txed'] = cases_pat.court.eq('txed')

    duration = []
    for ucid, row in tqdm(cases_pat.iterrows(), total=cases_pat.shape[0]):
        case = dtools.load_case(row.fpath)
        if not case.get('docket'):
            continue
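# Sketch (assumption): how the latest docket date and a case's duration in days
# might be computed inside the loop above. The field names ('date_filed', the
# per-entry 'date' key) and the date format are assumptions about the case json
# schema, not taken from the source.
def _sketch_case_duration_days(case):
    from datetime import datetime
    fmt = '%m/%d/%Y'
    filed = datetime.strptime(case['date_filed'], fmt)
    entry_dates = [
        datetime.strptime(entry['date'], fmt)
        for entry in case.get('docket', [])
        if isinstance(entry, dict) and entry.get('date')
    ]
    # Fall back to the filing date if no entry dates can be parsed
    latest = max(entry_dates) if entry_dates else filed
    return (latest - filed).days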
def identify_judge_entriesv2(jfhandle=None, docket=None):
    '''
    Attributes each docket entry to a judge.

    V2 -- Spacy language model to identify judge names on a per-entry basis.
    Entries without attribution are handled via a forward and then backward
    backfilling routine, assigning the closest attributed judge to the entry.
    Returns [judge name, judge name, ...], the same length as the input docket.

    input:
        * jfhandle -- str (opt), Filename for the json file
        * docket -- list (opt), default is None. Will supersede the json file
    output:
        * judge_ind_entries -- list, [judge name, judge name, ...]
    '''
    import json
    import re
    import spacy
    import sys
    sys.path.append('..')
    import support.data_tools as dtools
    import support.settings as settings

    exclusions = [
        'EXECUTIVE COMMITTEE', 'Executive Committee', 'executive committee',
        'GENERAL', 'General', 'general'
    ]

    # Input check
    if jfhandle is None and docket is None:
        print('INPUT ERROR: No docket or json filehandle provided. Returning None')
        return None

    # Load the data
    if not docket:
        case = dtools.load_case(jfhandle)
        docket = case['docket']
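# Sketch: the forward-then-backward fill described in the docstring above, applied
# to a per-entry list of judge names where unattributed entries are None. This is
# an illustration of the routine, not the source's implementation.
def _sketch_fill_judges(judge_per_entry):
    filled = list(judge_per_entry)
    # Forward fill: carry the last attributed judge forward
    last = None
    for i, name in enumerate(filled):
        if name is not None:
            last = name
        elif last is not None:
            filled[i] = last
    # Backward fill: cover any leading unattributed entries
    nxt = None
    for i in range(len(filled) - 1, -1, -1):
        if filled[i] is not None:
            nxt = filled[i]
        elif nxt is not None:
            filled[i] = nxt
    return filled

# e.g. _sketch_fill_judges([None, 'Doe', None, 'Smith', None])
#      -> ['Doe', 'Doe', 'Doe', 'Smith', 'Smith']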
def execute_idb_merge(merged_df):
    '''
    Update casefiles from an idb merge

    Inputs:
        - merged_df (DataFrame): a merged dataframe, output from idb_merge
    '''
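# Sketch (assumption): the body of execute_idb_merge is not shown above; it
# presumably applies _update_case_ to each row of merged_df, roughly as follows.
def _sketch_execute_idb_merge(merged_df, indent=2):
    from tqdm import tqdm
    for _, row in tqdm(merged_df.iterrows(), total=len(merged_df)):
        _update_case_(row, indent)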
def bundler(indf, name, notes=None, overwrite=False, anno_col=None):
    '''
    Bundle up a collection of files

    Inputs:
        - indf (DataFrame): any dataframe with an fpath column to identify files
        - name (str): name of the directory to bundle into (will be put in /data/{name})
        - notes (str): notes to be injected under the header (html string)
        - overwrite (bool): if True, clear out an existing bundle directory of the same name
        - anno_col (str): name of the annotations column, if any; values should be valid json strings
    '''
    df = indf.copy()

    # Want to include the index if it's the ucid
    if df.index.name == 'ucid':
        df = df.reset_index()

    if anno_col:
        df[anno_col] = df[anno_col].map(json.loads)

    # Columns needed to generate the bundle
    if 'fpath' not in df.columns:
        raise ValueError('DataFrame must include fpath column to point to file locations')
    elif 'ucid' not in df.columns:
        raise ValueError('DataFrame must include ucid to identify case')

    # Handle the output directory
    bundle_dir = settings.BUNDLES / name
    if bundle_dir.exists():
        if overwrite:
            # Delete all files in the directory
            for file in bundle_dir.iterdir():
                file.unlink()
        else:
            raise ValueError(f'The directory {str(bundle_dir)} already exists')
    else:
        bundle_dir.mkdir()

    # Start building the html index page with strings
    heading = f"<h1 class='heading'>Data Dump: {name}</h1>"
    notes = f'''<div class="notes">NOTES: {notes}</div>''' if notes else ''
    opening = f"<html>{index_style()}<body>{heading}{notes}"

    # Start building the table rows
    table_rows = []
    header = [f"<th>{val}</th>" for val in df.columns if val != anno_col]
    table_rows.append("".join(header))

    for i, row in tqdm(df.iterrows(), total=len(df)):
        # Get the filepath
        rel_path = row.fpath
        if type(rel_path) is str:
            rel_path = Path(rel_path.replace('\\', '/'))
        abs_path = settings.PROJECT_ROOT / rel_path

        # Annotation scenario
        if 'pacer' in abs_path.parts and anno_col and row[anno_col]:
            # Load the html text and json data to make the annotated docket
            hpath = dtools.get_pacer_html(abs_path)
            with open(hpath, 'r', encoding='utf-8') as rfile:
                html_text = rfile.read()
            json_data = dtools.load_case(row.fpath)
            new_html = make_annotated_docket(html_text, json_data, row[anno_col])

            # Copy the new (annotated) html into the bundle directory
            tqdm.write(f"Annotating {row.ucid}")
            new_name = row.ucid.replace(':', '-') + '.html'
            with open(bundle_dir / new_name, 'w', encoding='utf-8') as wfile:
                wfile.write(new_html)
        else:
            if 'pacer' in abs_path.parts:
                # Get the path to the html file
                abs_path = dtools.get_pacer_html(abs_path)

            # Copy the file
            tqdm.write(f"Copying {row.ucid}")
            new_name = row.ucid.replace(':', '-') + abs_path.suffix
            shutil.copyfile(abs_path, bundle_dir / new_name)

        cells = [f"<td>{v}</td>" for k, v in row.iteritems() if k != anno_col]
        row_string = f'''<tr onclick="window.open('{new_name}')">''' + "".join(cells) + "</tr>"
        table_rows.append(row_string)

    # Finish out the html string for the index
    table = f"<table class='maintable'>{''.join(table_rows)}</table>"
    closing = "</body></html>"
    html = opening + table + closing

    with open(bundle_dir / '_index.html', 'w+') as wfile:
        wfile.write(html)

    print(f"\nFiles successfully bundled into {bundle_dir}")
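# Usage sketch (assumption): how bundler might be called on a small dataframe of
# cases. The ucid value, the fpath, and the bundle name below are hypothetical;
# only the 'ucid'/'fpath' column requirements come from the function above.
def _example_bundler():
    import pandas as pd
    df = pd.DataFrame([
        {'ucid': 'ilnd;;1:16-cv-00001', 'fpath': 'data/pacer/ilnd/json/example_case.json'},
    ]).set_index('ucid')
    bundler(df, name='example_bundle', notes='<em>Demo bundle</em>', overwrite=True)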