def _read_checkpoint(path):
    """Return the integer stored on the first line of *path*, or None.

    None is returned when the first line is empty or is not an integer.
    The file is opened in "r+" mode, so it must already exist; open()
    raises otherwise, before any case has been imported.
    """
    with open(path, "r+") as checkpoint:
        try:
            return int(checkpoint.readline())
        except ValueError:
            return None


def _write_checkpoint(path, value):
    """Overwrite *path* with the string form of *value*."""
    with open(path, "w") as checkpoint:
        checkpoint.write(str(value))


def import_f3():
    """Iterate over the F3 documents and import them.

    A couple complications belie what would otherwise be a simple process:
      1. Duplicate detection. This is done by filtering by query and then
         refining the results that are found. For more details, see the
         dup_finder code.
      2. Merging duplicate documents. See their code in the f3_helpers
         module.

    Progress is checkpointed so that an interrupted run can be resumed:
    vol_file.txt holds the index of the volume being processed and
    case_file.txt holds the index of the next case within that volume. Both
    live under <INSTALL_ROOT>/Resource.org/logs/ and must already exist.
    """
    simulate = False
    corpus = dup_helpers.Corpus("%s/Resource.org/data/F3/" % settings.INSTALL_ROOT)
    vol_path = "%s/Resource.org/logs/vol_file.txt" % settings.INSTALL_ROOT
    case_path = "%s/Resource.org/logs/case_file.txt" % settings.INSTALL_ROOT

    # An empty or unparseable volume checkpoint means "start from the top".
    volume_num = _read_checkpoint(vol_path) or 0
    # The saved case index only applies to the first volume of this run;
    # every later volume starts at its first case.
    resume_case = _read_checkpoint(case_path)

    for volume in corpus[volume_num:]:
        print("################")
        print(" Vol: %s" % volume_num)
        print("################")
        if resume_case is None:
            j = 0
        else:
            j = resume_case
            print("Case: %s" % j)
            resume_case = None

        for case in volume[j:]:
            if dup_helpers.need_dup_check_for_date_and_court(case):
                run_dup_check(case, simulate)
            else:
                print("Dup check not needed. Adding the opinion.")
                if not simulate:
                    dup_helpers.add_case(case)

            # save our location within the volume
            j += 1
            _write_checkpoint(case_path, j)

        # The volume is done. Reset the case index *before* advancing the
        # volume so that a restart can never apply this volume's final case
        # index to the next volume (which would silently skip its first
        # cases). An interruption between the two writes re-runs this volume
        # rather than skipping anything.
        _write_checkpoint(case_path, 0)

        # save our location within the corpus
        volume_num += 1
        _write_checkpoint(vol_path, volume_num)
def _significant_docket_digits(docket_number):
    """Strip every non-digit and every zero from *docket_number*."""
    return re.sub(r"(\D|0)", "", docket_number)


def _confirm_duplicates_by_hand(case, filtered_candidates):
    """Show each candidate next to *case* and return the ids the user confirms.

    An empty answer counts as "yes" (the prompt's default). Answers are
    compared case-insensitively and with surrounding whitespace removed, so
    "Y", "y" and "yes" are all accepted; anything else means "no".
    """
    duplicates = []
    for position, candidate in enumerate(filtered_candidates, 1):
        # Have to determine by "hand"
        print(" %s) Case name: %s" % (position, case.case_name))
        print(" %s" % candidate["caseName"])
        print(" Docket nums: %s" % case.docket_number)
        print(" %s" % candidate["docketNumber"])
        print(" Candidate URL: %s" % case.download_url)
        print(" Match URL: https://www.courtlistener.com%s" % (candidate["absolute_url"]))
        choice = raw_input("Is this a duplicate? [Y/n]: ").strip().lower() or "y"
        if choice in ("y", "yes"):
            duplicates.append(candidate["id"])
    return duplicates


def run_dup_check(case, simulate=True):
    """Runs a series of duplicate checking code, generating and analyzing
    stats about whether the case is a duplicate.

    Depending on what is found, the case is added as a new opinion or merged
    into one or more existing ones. The checks run in this order:
      1. No candidates at all: add the case.
      2. Exactly one candidate whose docket number matches (ignoring
         non-digits and zeros): merge with it.
      3. One or more candidates reported by
         dup_helpers.find_same_docket_numbers: merge with it/them.
      4. Otherwise filter the candidates by their stats and ask the user
         about whatever remains.

    When simulate is true, only the messages are printed; nothing is added
    or merged.
    """
    print("Running dup check...")

    # stats takes the form: [count_from_search] or
    #                       [count_from_search,
    #                        count_from_docket_num,
    #                        [case_name_diff_1, diff_2, diff_3, etc],
    #                        [content_length_percent_diff_1, 2, 3],
    #                        [content_diff_1, 2, 3]
    #                       ]
    # candidates is a list of 0 to n possible duplicates
    stats, candidates = dup_finder.get_dup_stats(case)

    if not candidates:
        print(" No candidates found. Adding the opinion.")
        if not simulate:
            dup_helpers.add_case(case)
        return

    docket_matches_first = _significant_docket_digits(
        case.docket_number
    ) == _significant_docket_digits(candidates[0]["docketNumber"])
    if docket_matches_first and len(candidates) == 1:
        # If the docket numbers are identical, and there was only
        # one result
        print(" Match made on docket number of single candidate. Merging the " "opinions.")
        if not simulate:
            dup_helpers.merge_cases_simple(case, candidates[0]["id"])
        return

    # Looked up once and reused by both docket-number branches below.
    # NOTE(review): assumes find_same_docket_numbers returns the same result
    # for the same arguments -- confirm against dup_helpers.
    same_docket = dup_helpers.find_same_docket_numbers(case, candidates)
    if len(same_docket) == 1:
        print(
            " One of the %s candidates had an identical docket number. "
            "Merging the opinions." % len(candidates)
        )
        if not simulate:
            dup_helpers.merge_cases_simple(case, same_docket[0]["id"])
        return
    if len(same_docket) > 0:
        print(
            " Several of the %s candidates had an identical docket "
            "number. Merging the opinions." % len(candidates)
        )
        if not simulate:
            target_ids = [can["id"] for can in same_docket]
            dup_helpers.merge_cases_complex(case, target_ids)
        return

    # Possible duplicate, filter out obviously bad cases, and
    # then pass forward for manual review if necessary.
    filtered_candidates, stats = dup_helpers.filter_by_stats(candidates, stats)
    if len(filtered_candidates) == 0:
        print("After filtering, no candidates remain. Adding the opinion.")
        if not simulate:
            dup_helpers.add_case(case)
        return

    print("FILTERED STATS: %s" % stats)
    duplicates = _confirm_duplicates_by_hand(case, filtered_candidates)

    if len(duplicates) == 0:
        print("No duplicates found after manual determination. " "Adding the opinion.")
        if not simulate:
            dup_helpers.add_case(case)
    elif len(duplicates) == 1:
        print("Single duplicate found after manual determination. " "Merging the opinions.")
        if not simulate:
            dup_helpers.merge_cases_simple(case, duplicates[0])
    else:
        print("Multiple duplicates found after manual determination. " "Merging the opinions.")
        if not simulate:
            dup_helpers.merge_cases_complex(case, duplicates)