Example #1
def get_case_matches(fpath,
                     patterns,
                     wide_net,
                     computed_attrs={},
                     rlim=None,
                     wide_net_fn=None,
                     skip_non_matches=False):
    '''
    Process a case and return observation rows

    Output:
    (list) of observation rows (dicts)
    '''

    case_rows = []
    case = dtools.load_case(fpath)

    for ind, line in enumerate(case['docket']):

        if wide_net_match_line(line, case, wide_net, wide_net_fn):
            # Use row builder
            row = row_builder(docket_line=line,
                              ind=ind,
                              case=case,
                              fpath=fpath,
                              patterns=patterns,
                              computed_attrs=computed_attrs,
                              rlim=rlim)

            if skip_non_matches:
                # Only add row if at least one pattern match
                if not any(v for k, v in row.items() if k in patterns):
                    continue

            case_rows.append(row)

    return case_rows
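
# A minimal usage sketch (an assumption, not from the original source): the
# pattern names, regexes, wide-net terms and file path below are hypothetical,
# chosen only to illustrate the calling convention described in the docstring.
example_patterns = {'seal': r'\bseal(ed)?\b', 'redact': r'\bredact(ed|ion)?\b'}
example_rows = get_case_matches('pacer/ilnd/json/example_case.json',
                                patterns=example_patterns,
                                wide_net=[r'seal', r'redact'],
                                rlim=500,
                                skip_non_matches=True)
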
def build_seal_idx():
    '''
    Build a text file with a list of indexes (from unique file df)
    of cases that contain the words 'seal/redact/protective/restricted'
    '''
    seal_idx = []
    dff = dt.load_unique_files_df()

    for i, row in tqdm(dff.iterrows(), total=dff.shape[0]):
        case = dt.load_case(row.fpath)
        if dof.find_pattern(case.get('docket', []), RE_WIDE_NET, rlim=RLIM):
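
# A self-contained sketch of the same idea as build_seal_idx above, based only
# on its docstring (an assumption, not the original implementation): collect
# the matching indexes and write them to a text file. The output path is a
# placeholder.
def build_seal_idx_sketch(out_path='seal_idx.txt'):
    seal_idx = []
    dff = dt.load_unique_files_df()
    for i, row in tqdm(dff.iterrows(), total=dff.shape[0]):
        case = dt.load_case(row.fpath)
        if dof.find_pattern(case.get('docket', []), RE_WIDE_NET, rlim=RLIM):
            seal_idx.append(i)
    # One index per line
    with open(out_path, 'w', encoding='utf-8') as wfile:
        wfile.write('\n'.join(str(ind) for ind in seal_idx))
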
def identify_judge_entriesv1(jfhandle=None, docket=None, djudge=''):
    '''
    Attributes each docket entry to a judge.
    V1 -- Regex rules to identify docket 'chunks' and then do block attribution to chunk
          Returns [[district, case_id, judge_name, nos, clean_nos, entry number, entry text], ...]
    input:
       * jfhandle -- str (opt), Filename for the json file
       * docket -- list (opt), default is None. Will supersede the json file
    output:
       * judge_ind_entries -- list, [judge name, judge name, ...]
    '''
    def _clean_name(judge_name, punc=True):
        #deletions
        titles = [
            'Judge ', 'Senior Judge ', 'Magistrate Judge ', 'Chief Judge ',
            'Honorable '
        ]
        puncs = [r'\.', ',']
        #Clean the titles and punctuation
        try:
            for title in titles:
                if title in judge_name:
                    judge_name = judge_name.split(title)[-1]
            if punc:
                for p in puncs:
                    judge_name = re.sub(p, '', judge_name)
            judge_name = judge_name.strip(' ')
        except TypeError:
            pass
        return judge_name
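
    # Illustrative behavior of _clean_name above (inferred from the title and
    # punctuation lists, hedged): 'Honorable John A. Smith' would have the
    # 'Honorable ' prefix and the periods removed, yielding 'John A Smith'.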

    import json
    import re
    import spacy
    import sys
    sys.path.append('..')
    import support.data_tools as dtools
    import support.settings as settings

    #Input check
    if jfhandle is None and docket is None:
        print(
            'INPUT ERROR: No docket or json filehandle provided. Returning None'
        )
        return None

    #Load the data
    if not docket:
        case = dtools.load_case(jfhandle)
        docket = case['docket']
        djudge = case['judge']
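
# Hypothetical call pattern (illustration only, not from the original source);
# the json file path is a placeholder.
example_entries = identify_judge_entriesv1(jfhandle='pacer/ilnd/json/example_case.json')
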
def main(outfile, sample_n, year_inp, court_inp, nos_inp, allow_non_matches):
    '''
    Process all of the courts to build dataset
    '''
    # Gather the filepaths csv
    files_df = dt.load_unique_files_df()

    # Filter by relevant "seal" cases
    files_df = filter_cases_seal(files_df).copy()

    if year_inp:
        files_df = files_df[files_df.year==year_inp].copy()
        print(f'Running only on cases from {year_inp}, reduced dataset to {len(files_df):,} cases')

    if court_inp:
        files_df = files_df[files_df.court==court_inp].copy()
        print(f'Running only on cases from {court_inp}, reduced dataset to {len(files_df):,} cases')

    if nos_inp:
        files_df = files_df[files_df.nature_suit.fillna('').str.startswith(str(nos_inp))].copy()
        print(f'Running only on cases with nature of suit {nos_inp}, reduced dataset to {len(files_df):,} cases')

    # If sample size specified, run on random subset
    if sample_n:
        files_df = files_df.sample(sample_n).copy()
        print(f'Running on random subset of size {sample_n:,}')

    print(f'Processing {len(files_df):,} cases...\n')

    #Build the csv file line-by-line
    out_file_name = outfile
    col_names = ['court', 'judge', 'case_id', 'ucid', 'line_ind','fpath', 'case_type',
                 'nature_suit','text', 'date_docket_line', 'days_from_filing',
                 'is_multi','is_mdl','mdl_code', *pats.keys()]
    w_count = 0  #Keep count of lines written

    with open(out_file_name, 'w+', encoding="utf-8") as wfile:
        writer = csv.writer(wfile)
        writer.writerow(col_names)

        # Iterate through all relevant files
        for i, row in tqdm(files_df.iterrows(), total=len(files_df), desc="Files Processed"):

            case = dt.load_case(row.fpath)
            if 'docket' not in case.keys():
                continue

            # Skip cases whose docket is a nested list of lists
            if case['docket'] and isinstance(case['docket'][0], list):
                tqdm.write(str(i))
                continue
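
# A minimal sketch of wiring main() to a command line (an assumption: the
# original script's argument parsing is not shown here). The option names are
# hypothetical; argparse is used only for illustration.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Build the sealed-docket dataset')
    parser.add_argument('outfile')
    parser.add_argument('--sample-n', type=int, default=None)
    parser.add_argument('--year', type=int, default=None)
    parser.add_argument('--court', default=None)
    parser.add_argument('--nos', type=int, default=None)
    parser.add_argument('--allow-non-matches', action='store_true')
    args = parser.parse_args()
    main(args.outfile, args.sample_n, args.year, args.court, args.nos,
         args.allow_non_matches)
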
Example #5
def _update_case_(row, indent):
    '''
    Update a single case file json with idb data

    Inputs:
        - row (Series or dict): row of the merged dataframe that contains data on
                    the case (fpath,case_type and all idb_recap columns needed)
        - indent (int): size of indent if pretty printing
    '''
    # Actually fix incorrect data
    if row.recap:
        # Get the case and update the idb_data key
        case = dtools.load_case(row.fpath, recap_orig=True)
        case['idb_data'] = extract_recap_idb_data(row, row.case_type)

        # Update the outer json with idb_data
        for key in ['date_filed', 'date_terminated', 'nature_of_suit']:
            case[key] = case['idb_data'][key]
def build_df_dur(df, year=2016):
    '''
    Build table with relevant case duration data (need to open each case to verify latest date)
    Inputs:
        - df (pd.DataFrame): the main docket-line level dataframe of sealed data
        - year (int): year used to filter the cases (default 2016)

    '''

    dff = dtools.load_unique_files_df()
    # Get the subset of cases from unique files table that are patent cases
    cases_pat = dff[dff.nature_suit.eq(PATENT_NOS) & dff.year.eq(year)
                    & ~dff.is_multi.eq(True)].copy()
    cases_pat['is_txed'] = cases_pat.court.eq('txed')

    duration = []
    for ucid, row in tqdm(cases_pat.iterrows(), total=cases_pat.shape[0]):
        case = dtools.load_case(row.fpath)
        if not case.get('docket'):
            continue
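
# A hedged sketch of one way the duration could be computed for each case in
# the loop above (an assumption, not the original code): take the latest date
# found on the docket, fall back to date_terminated, and measure days from
# filing. 'date_filed' and 'date_terminated' appear elsewhere in these
# examples; treating each docket entry as a dict with a 'date' field is an
# assumption.
def _case_duration_days(case):
    import pandas as pd
    filed = pd.to_datetime(case.get('date_filed'))
    entry_dates = pd.to_datetime(
        [entry.get('date') for entry in case.get('docket', []) if entry.get('date')],
        errors='coerce')
    latest = entry_dates.max() if len(entry_dates) else pd.to_datetime(case.get('date_terminated'))
    if pd.isna(filed) or pd.isna(latest):
        return None
    return (latest - filed).days
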
def identify_judge_entriesv2(jfhandle=None, docket=None):
    '''
    Attributes each docket entry to a judge.
    V2 -- Spacy language model to identify judge names on a per-entry basis. Entries without attribution are filled via
          a forward and then backward backfilling routine, assigning the closest attributed judge to the entry.
          Returns [judge name, judge name, ...], which will be the same length as the input docket
    input:
       * jfhandle -- str (opt), Filename for the json file
       * docket -- list (opt), default is None. Will supersede the json file
    output:
       * judge_ind_entries -- list, [judge name, judge name, ...]
    '''
    import json
    import re
    import spacy
    import sys
    sys.path.append('..')
    import support.data_tools as dtools
    import support.settings as settings

    exclusions = [
        'EXECUTIVE COMMITTEE', 'Executive Committee', 'executive committee',
        'GENERAL', 'General', 'general'
    ]

    #Input check
    if jfhandle is None and docket is None:
        print(
            'INPUT ERROR: No docket or json filehandle provided. Returning None'
        )
        return None

    #Load the data
    if not docket:
        case = dtools.load_case(jfhandle)
        docket = case['docket']
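
# Hedged sketch of the forward-then-backward backfilling routine described in
# the docstring above (an illustration, not the original implementation):
# entries with no attributed judge (None) inherit the nearest attribution,
# scanning forward first and then backward.
def _backfill_judges(judge_per_entry):
    filled = list(judge_per_entry)
    # Forward fill: carry the last seen judge forward
    last_seen = None
    for i, name in enumerate(filled):
        if name is not None:
            last_seen = name
        elif last_seen is not None:
            filled[i] = last_seen
    # Backward fill: leading unattributed entries take the next attributed judge
    next_seen = None
    for i in range(len(filled) - 1, -1, -1):
        if filled[i] is not None:
            next_seen = filled[i]
        elif next_seen is not None:
            filled[i] = next_seen
    return filled
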
Example #8
def _update_case_(row, indent):
    '''
    Update a single case file json with idb data

    Inputs:
        - row (Series or dict): row of the merged dataframe that contains data on
                    the case (fpath,case_type and all idb_recap columns needed)
        - indent (int): size of indent if pretty printing
    '''
    # Actually fix incorrect data
    if row.recap:
        # Get the case and update the idb_data key
        case = dtools.load_case(row.fpath, recap_orig=True)
        case['idb_data'] = extract_recap_idb_data(row, row.case_type)

        # Update the outer json with idb_data
        for key in ['date_filed', 'date_terminated', 'nature_of_suit']:
            case[key] = case['idb_data'][key]

    else:
        # Pacer: just add in data
        case = dtools.load_case(row.fpath)
        case['idb_data'] = extract_recap_idb_data(row, row.case_type)

    with open(settings.PROJECT_ROOT / row.fpath, 'w+',
              encoding='utf-8') as wfile:
        simplejson.dump(case, wfile, ignore_nan=True, indent=indent)


def execute_idb_merge(merged_df):
    '''
    Update casefiles from an idb merge

    Inputs
        - merged_df (DataFrame): a merged dataframe, output from idb_merge
    '''
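
# A hedged sketch of what the body of execute_idb_merge could look like, given
# its docstring and _update_case_ above (an assumption, not the original code);
# the indent value is arbitrary.
def execute_idb_merge_sketch(merged_df, indent=2):
    for _, row in tqdm(merged_df.iterrows(), total=len(merged_df)):
        _update_case_(row, indent)
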
Example #9
def bundler(indf, name, notes=None, overwrite=False, anno_col=None):
    '''
    Bundle up a collection of files
    Inputs:
        - indf (DataFrame): any dataframe with an fpath column to identify files
        - name (str): name of directory to bundle into (will be put in /data/{name})
        - notes (str): notes to be injected under the header (html string)
        - overwrite (bool): if True, clear and reuse an existing bundle directory instead of raising an error
        - anno_col (str): name of annotations column if any, column should be valid json string
    '''
    df = indf.copy()
    # Want to include the index if it's ucid
    if df.index.name == 'ucid':
        df = df.reset_index()

    if anno_col:
        df[anno_col] = df[anno_col].map(json.loads)

    # Columns needed to generate
    if 'fpath' not in df.columns:
        raise ValueError(
            'DataFrame must include fpath column to point to file locations')
    elif 'ucid' not in df.columns:
        raise ValueError('DataFrame must include ucid to identify case')

    # Handle directory
    bundle_dir = settings.BUNDLES / name
    if bundle_dir.exists():
        if overwrite:
            # Delete all files in the directory
            for file in bundle_dir.iterdir():
                file.unlink()
        else:
            raise ValueError(f'The directory {str(bundle_dir)} already exists')
    else:
        bundle_dir.mkdir()

    # Start building html index page with strings
    heading = f"<h1 class='heading'>Data Dump: {name}</h1>"
    notes = f'''<div class="notes">NOTES: {notes}</div>''' if notes else ''
    opening = f"<html>{index_style()}<body>{heading}{notes}"

    # Start building table rows
    table_rows = []
    header = [f"<th>{val}</th>" for val in df.columns if val != anno_col]
    table_rows.append("".join(header))

    for i, row in tqdm(df.iterrows(), total=len(df)):
        # Get filepath
        rel_path = row.fpath
        if isinstance(rel_path, str):
            rel_path = Path(rel_path.replace('\\', '/'))
        abs_path = settings.PROJECT_ROOT / rel_path

        # Annotation scenario
        if 'pacer' in abs_path.parts and anno_col and row[anno_col]:
            # Load the html text and json data to make the annotated docket
            hpath = dtools.get_pacer_html(abs_path)
            with open(hpath, 'r', encoding='utf-8') as rfile:
                html_text = rfile.read()
            json_data = dtools.load_case(row.fpath)
            new_html = make_annotated_docket(html_text, json_data,
                                             row[anno_col])

            # Copy the new (annotated) html into the bundle directory
            tqdm.write(f"Annotating {row.ucid}")
            new_name = row.ucid.replace(':', '-') + '.html'
            with open(bundle_dir / new_name, 'w', encoding='utf-8') as wfile:
                wfile.write(new_html)

        else:
            if 'pacer' in abs_path.parts:
                # Get the path to the html file
                abs_path = dtools.get_pacer_html(abs_path)

            # Copy the file
            tqdm.write(f"Copying {row.ucid}")
            new_name = row.ucid.replace(':', '-') + abs_path.suffix
            shutil.copyfile(abs_path, bundle_dir / new_name)

        cells = [f"<td>{v}</td>" for k, v in row.iteritems() if k != anno_col]
        row_string = f'''<tr onclick="window.open('{new_name}')">''' + "".join(
            cells) + "</tr>"
        table_rows.append(row_string)

    # Finish out the html string for the index
    table = f"<table class='maintable'>{''.join(table_rows)}</table>"
    closing = f"</body></html>"
    html = opening + table + closing

    with open(bundle_dir / '_index.html', 'w+') as wfile:
        wfile.write(html)

    print(f"\nFiles Succesfully bundled into {bundle_dir}")