import csv
from pathlib import Path

import pandas as pd

# Assumed to be available at module level in the surrounding package (not shown
# in this section): `dtools` (SCALES data tools), `IDB_COLS`, `BARE_MIN_COLS`,
# and `load_idb_csv`.


def make_ucid_and_weak(docket, office, district, case_type):
    ''' Make the ucid and weak_ucid from IDB data. Takes str values for a
    single row, or Series inputs with Series outputs.

    Inputs:
        - docket (str or Series): IDB 'docket', looks like 1600123 for year=16, case_no=00123
        - office (str or Series): IDB office no.
        - district (str or Series): court abbreviation e.g. 'ilnd'
        - case_type (str): 'cv' or 'cr'
    Outputs:
        - ucid (str or Series): looks like 'ilnd;;1:16-cv-00123'
        - ucid_weak (str or Series): ucid with office removed, looks like 'ilnd;;16-cv-00123'
    '''
    # Split the docket into its two-digit year and the case number
    if isinstance(docket, pd.Series):
        case_year = docket.str.slice(0, 2)
        case_no = docket.str.slice(2)
    else:
        case_year = docket[:2]
        case_no = docket[2:]

    ucid = dtools.ucid_from_scratch(district, office, case_year, case_type, case_no)
    ucid_weak = dtools.get_ucid_weak(ucid)
    return ucid, ucid_weak
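# Example usage (a sketch; the docket/office values are hypothetical, and the
# expected outputs follow the formats described in the docstring above):
#
#   >>> make_ucid_and_weak('1600123', '1', 'ilnd', 'cv')
#   ('ilnd;;1:16-cv-00123', 'ilnd;;16-cv-00123')
#
#   >>> dockets = pd.Series(['1600123', '1700456'])
#   >>> offices = pd.Series(['1', '3'])
#   >>> ucids, ucids_weak = make_ucid_and_weak(dockets, offices, 'ilnd', 'cv')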
def idb_merge(idb_data_file, case_type, preloaded_idb_data_file=None, dframe=None):
    ''' Merge a dataframe of cases with IDB data.

    Inputs:
        - idb_data_file (str or Path): the IDB csv file to use e.g. 'cv10to19.csv'
        - case_type (str): the case type ('cv' or 'cr') of the cases in the IDB file provided
        - preloaded_idb_data_file (DataFrame): a preloaded IDB dataframe, e.g. if the
            consumer has already called load_idb_csv
        - dframe (DataFrame): a table of case files to merge, instead of using the
            full unique files table
    Outputs:
        - final (DataFrame): the merged table
        - match_rate (float): the proportion of original casefiles matched against the IDB
    '''
    if dframe is None:
        dff = dtools.load_unique_files_df()
        dff = dff[dff.case_type.eq(case_type)].copy()
    else:
        dff = dframe.copy()

    N = dff.shape[0]
    print(f"\n{N:,} SCALES cases provided")

    # Make sure there's a ucid column, and keep a copy to use as the final index
    dff.reset_index(inplace=True)
    dff['ucid_copy'] = dff['ucid'].copy()

    if preloaded_idb_data_file is not None:
        df_idb = preloaded_idb_data_file
    else:
        print(f'Loading idb file: {idb_data_file}...')
        df_idb = load_idb_csv(idb_data_file, case_type=case_type, cols=BARE_MIN_COLS)

    # Keep only the earliest-filed IDB row per ucid
    df_idb.sort_values(['ucid', 'filedate'], inplace=True)
    df_idb.drop_duplicates('ucid', keep='first', inplace=True)

    # Stage 1 (matching on ucid)
    print('STAGE 1: matching on ucid...')
    matched_mask = dff.ucid.isin(df_idb.ucid)
    matched_ucids = dff.ucid[matched_mask]

    keepcols = ['fpath', 'case_type', 'filing_date', 'terminating_date', 'source',
                *[x.lower() for x in BARE_MIN_COLS]]
    if 'nos_subtype' in dff.columns:
        keepcols.append('nos_subtype')

    # Make table of data merged on ucid
    print('STAGE 1: merging...')
    merged_ucid = dff[matched_mask].merge(df_idb, how='inner', on='ucid')\
                                   .set_index('ucid_copy')[keepcols]
    print(f'STAGE 1: {{matched: {sum(matched_mask):,}, unmatched: {sum(~matched_mask):,}}}')

    # Reduce dff to the unmatched cases and create their weak ucids
    dff = dff[~matched_mask].copy()
    dff['ucid_weak'] = dtools.get_ucid_weak(dff.ucid)

    # Remove already-matched rows from df_idb and reduce it to weak-ucid candidates
    print('STAGE 2: matching on weak_ucid...')
    df_idb = df_idb[~df_idb.ucid.isin(matched_ucids) & df_idb.ucid_weak.isin(dff.ucid_weak)]

    # Stage 2 (matching on ucid_weak and filing date)
    print('STAGE 2: merging...')
    merged_weak = dff.merge(df_idb, how='inner', left_on=['ucid_weak', 'filing_date'],
                            right_on=['ucid_weak', 'filedate'])\
                     .set_index('ucid_copy')[keepcols]
    matched_stage2 = merged_weak.shape[0]
    print(f'STAGE 2: {{matched: {matched_stage2:,}, unmatched: {sum(~matched_mask) - matched_stage2:,}}}')

    final = pd.concat([merged_ucid, merged_weak])
    del dff, df_idb

    match_rate = final.shape[0] / N
    print(f"Overall match rate: {match_rate:.2%}")
    return final, match_rate
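# Example usage (a sketch; 'cv10to19.csv' follows the docstring example, and
# `my_cases` is a hypothetical DataFrame of SCALES case files):
#
#   >>> final, match_rate = idb_merge('cv10to19.csv', 'cv')
#
#   # Or, with a preloaded IDB table and a custom subset of cases:
#   >>> df_idb = load_idb_csv('cv10to19.csv', case_type='cv', cols=BARE_MIN_COLS)
#   >>> final, match_rate = idb_merge('cv10to19.csv', 'cv',
#   ...                               preloaded_idb_data_file=df_idb, dframe=my_cases)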
def split_txt(old_file, out_dir, case_type, year_lb=0, nrows=None, year_var='DOCKET'):
    ''' Cut one of the large tab-delimited .txt IDB datasets into multiple csv files, by year.

    Inputs:
        - old_file (str or Path): the .txt file to be split
        - out_dir (str or Path): the output directory for the new csv files
        - case_type (str): 'cv' or 'cr'
        - year_lb (int): lower bound on year; rows with a filedate below this are filtered out
        - nrows (int): max number of rows to write (for testing small samples)
        - year_var ('DOCKET' or 'FILEDATE'): which IDB variable to get the year from
            (for file splitting)
    '''
    # Create the output directory if it doesn't exist
    out_dir = Path(out_dir).resolve()
    if not out_dir.exists():
        out_dir.mkdir()

    with open(old_file, 'r', encoding='ISO-8859-1') as rfile:
        # Get the column headers from the first line
        columns = rfile.readline().rstrip('\n').split('\t')
        ind_filedate = columns.index('FILEDATE')
        write_count = 0

        # Session dictionary mapping each year to its open file and csv writer
        session = {}

        for line in rfile:
            # Extract the data in the line
            row = line.rstrip('\n').split('\t')
            if len(row) != len(columns):
                # Malformed row, skip it
                continue

            # Filter by the year lower bound
            file_year = int(row[ind_filedate].split('/')[-1])
            if file_year < year_lb:
                continue

            if year_var == 'FILEDATE':
                split_year = file_year
            elif year_var == 'DOCKET':
                # Use the year from the DOCKET variable e.g. 1600001 -> 16
                split_year = row[columns.index('DOCKET')][:2]
            else:
                raise ValueError("`year_var` must be in ('FILEDATE', 'DOCKET')")

            # Check if we have a csv for this year and, if not, start it up
            if split_year not in session:
                filepath = out_dir / f"{case_type}{split_year}.csv"
                session[split_year] = {'file': open(filepath, 'w', encoding='utf-8', newline='\n')}
                session[split_year]['writer'] = csv.writer(session[split_year]['file'])
                # Write the header row for this new file
                session[split_year]['writer'].writerow(['ucid', 'ucid_weak', *columns])

            # Find the ucid and weak_ucid
            data = {k: row[columns.index(k)] for k in ['DOCKET', 'DISTRICT', 'OFFICE']}
            case_year = data['DOCKET'][:2]
            case_no = data['DOCKET'][2:]
            court = IDB_COLS['DISTRICT']['conv'](data['DISTRICT'])
            ucid = dtools.ucid_from_scratch(court, data['OFFICE'], case_year, case_type, case_no)
            ucid_weak = dtools.get_ucid_weak(ucid)

            # Write the new row, which is (ucid, ucid_weak, <original row data>)
            session[split_year]['writer'].writerow([ucid, ucid_weak, *row])
            write_count += 1
            if nrows and write_count >= nrows:
                break

    for v in session.values():
        v['file'].close()
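# Example usage (a sketch; the input filename and output directory are hypothetical):
#
#   >>> split_txt('cv10to19.txt', 'idb_by_year/', 'cv', year_lb=2010)
#
#   # With the default year_var='DOCKET', this writes idb_by_year/cv10.csv,
#   # idb_by_year/cv11.csv, ... with (ucid, ucid_weak) prepended to each
#   # original IDB row.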