def merge_cases_simple(new, target_id):
    """Add `new` to the database, merging with target_id

    Merging is done by picking the best fields from each item.
    """
    target = Document.objects.get(pk=target_id)
    print("Merging %s with" % new.citation.case_name)
    print(" %s" % target.citation.case_name)

    # Remember the original source before it gets rewritten below; the
    # download-URL decision further down depends on the *original* value.
    cached_source = target.source

    # Upgrade the source code to record that lawbox ('L') now contributes.
    source_upgrades = {'C': 'LC', 'R': 'LR', 'CR': 'LCR'}
    if target.source in source_upgrades:
        target.source = source_upgrades[target.source]

    # Add the URL if it's not a court one, replacing resource.org's info in
    # some cases.
    if cached_source == 'R':
        target.download_url = new.download_url

    # Recreate the slug from the new case name (this changes the URL, but the
    # old will continue working)
    target.citation.slug = trunc(slugify(new.citation.case_name), 50)

    # Take the case name from the new item; they tend to be pretty good
    target.citation.case_name = new.citation.case_name

    # Add the docket number if the old doesn't exist, but keep the old if one
    # does.
    if not target.citation.docket_number:
        target.citation.docket_number = new.citation.docket_number

    # Get the citations from the new item (ditch the old).
    citation_fields = (
        'federal_cite_one',
        'federal_cite_two',
        'federal_cite_three',
        'state_cite_one',
        'state_cite_two',
        'state_cite_three',
        'state_cite_regional',
        'specialty_cite_one',
        'scotus_early_cite',
        'lexis_cite',
        'westlaw_cite',
        'neutral_cite',
    )
    for field_name in citation_fields:
        setattr(target.citation, field_name, getattr(new.citation, field_name))

    # Add judge information if lacking. New is dirty, but better than none.
    if not target.judges:
        target.judges = new.judges

    # Add the text.
    target.html_lawbox, blocked = anonymize(new.html)
    if blocked:
        target.blocked = True
        target.date_blocked = now()

    target.extracted_by_ocr = False  # No longer true for any LB case.
    save_doc_and_cite(target, index=False)
def main():
    """Entry point: import the lawbox bulk corpus one case at a time.

    Cases come either from walking a directory (--dir), from random lines of
    an index file (--random), or serially from the index file starting at a
    line number (--line / --resume). Each case is imported, de-duplicated,
    and saved unless --simulate is set.
    """
    parser = argparse.ArgumentParser(description="Import the corpus provided by lawbox")
    parser.add_argument(
        "-s", "--simulate",
        default=False, required=False, action="store_true",
        help="Run the code in simulate mode, making no permanent changes.",
    )
    parser.add_argument(
        "-d", "--dir",
        type=readable_dir,
        help="The directory where the lawbox bulk data can be found.",
    )
    parser.add_argument(
        "-f", "--file",
        type=str, default="index.txt", required=False, dest="file_name",
        help="The file that has all the URLs to import, one per line.",
    )
    parser.add_argument(
        "-l", "--line",
        type=int, default=1, required=False,
        help="If provided, this will be the line number in the index file where we resume processing.",
    )
    parser.add_argument(
        "-r", "--resume",
        default=False, required=False, action="store_true",
        help="Use the saved marker to resume operation where it last failed.",
    )
    parser.add_argument(
        "-x", "--random",
        default=False, required=False, action="store_true",
        help="Pick cases randomly rather than serially.",
    )
    parser.add_argument(
        "-m", "--marker",
        type=str, default="lawbox_progress_marker.txt", required=False,
        help="The name of the file that tracks the progress (useful if multiple versions run at same time)",
    )
    parser.add_argument(
        "-e", "--end",
        type=int, required=False, default=2000000,
        help="An optional endpoint for an importer.",
    )
    args = parser.parse_args()

    if args.dir:
        def case_generator(dir_root):
            """Yield cases, one by one to the importer by recursing and
            iterating the import directory"""
            for root, dirnames, filenames in os.walk(dir_root):
                for filename in fnmatch.filter(filenames, "*"):
                    yield os.path.join(root, filename)

        # BUG FIX: this previously read `args.root`, which argparse never
        # defines (the option is `--dir`), so this branch always crashed
        # with an AttributeError.
        cases = case_generator(args.dir)
        i = 0
    else:
        def generate_random_line(file_name):
            """Yield a random (whole) line of file_name, forever."""
            while True:
                total_bytes = os.stat(file_name).st_size
                random_point = random.randint(0, total_bytes)
                # `with` ensures the handle is closed every iteration; the
                # original leaked one open file per case.
                with open(file_name) as f:
                    f.seek(random_point)
                    f.readline()  # skip this line to clear the partial line
                    yield f.readline().strip()

        def case_generator(line_number):
            """Yield cases from the index file."""
            # The enumeration is zero-index, but files are one-index.
            enumerated_line_number = line_number - 1
            with open(args.file_name) as index_file:
                for i, line in enumerate(index_file):
                    if i >= enumerated_line_number:
                        yield line.strip()

        if args.random:
            cases = generate_random_line(args.file_name)
            i = 0
        elif args.resume:
            with open(args.marker) as marker:
                resume_point = int(marker.read().strip())
            cases = case_generator(resume_point)
            i = resume_point
        else:
            cases = case_generator(args.line)
            i = args.line

    for case_path in cases:
        if i % 1000 == 0:
            db.reset_queries()  # Else we leak memory when DEBUG is True
        if "counter" in DEBUG:  # and i % 1000 == 0:
            log_print("\n%s: Doing case (%s): file://%s" % (datetime.datetime.now(), i, case_path))
        try:
            doc = import_law_box_case(case_path)
            duplicates = find_duplicates(doc, case_path)
            if not args.simulate:
                if len(duplicates) == 0:
                    doc.html_lawbox, blocked = anonymize(doc.html)
                    doc.html = ""
                    if blocked:
                        doc.blocked = True
                        doc.date_blocked = now()
                    # Save nothing to the index for now (it'll get done when
                    # we find citations)
                    save_doc_and_cite(doc, index=False)
                if len(duplicates) == 1:
                    dup_helpers.merge_cases_simple(doc, duplicates[0])
                if len(duplicates) > 1:
                    # complex_merge
                    if "log_multimerge" in DEBUG:
                        with open("index_multimerge.txt", "a") as log:
                            log.write("%s\n" % case_path)
            if args.resume:
                # Don't change the progress marker unless you're in resume mode.
                with open(args.marker, "w") as marker:
                    marker.write(str(i + 1))  # Files are one-index, not zero-index
            with open("lawbox_fix_file.pkl", "wb") as fix_file:
                pickle.dump(fixes, fix_file)
            i += 1
            if i == args.end:
                log_print("Hit the endpoint after importing number %s. Breaking." % i)
                break
        except Exception:
            # The bound name `err` was never used; `except Exception:` is
            # valid in both Python 2 and 3 (the old comma form is Py2-only).
            log_print(traceback.format_exc())
            exit(1)
def main():
    """Entry point: import the lawbox bulk corpus one case at a time.

    NOTE(review): this is a second definition of ``main`` in the same file;
    it shadows the earlier one. The duplicates should be reconciled into a
    single definition.
    """
    parser = argparse.ArgumentParser(
        description='Import the corpus provided by lawbox')
    parser.add_argument(
        '-s', '--simulate', default=False, required=False,
        action='store_true',
        help='Run the code in simulate mode, making no permanent changes.')
    parser.add_argument(
        '-d', '--dir', type=readable_dir,
        help='The directory where the lawbox bulk data can be found.')
    parser.add_argument(
        '-f', '--file', type=str, default="index.txt", required=False,
        dest="file_name",
        help="The file that has all the URLs to import, one per line.")
    parser.add_argument(
        '-l', '--line', type=int, default=1, required=False,
        help='If provided, this will be the line number in the index file '
             'where we resume processing.')
    parser.add_argument(
        '-r', '--resume', default=False, required=False, action='store_true',
        help='Use the saved marker to resume operation where it last failed.')
    parser.add_argument(
        '-x', '--random', default=False, required=False, action='store_true',
        help='Pick cases randomly rather than serially.')
    parser.add_argument(
        '-m', '--marker', type=str, default='lawbox_progress_marker.txt',
        required=False,
        help="The name of the file that tracks the progress (useful if "
             "multiple versions run at same time)")
    parser.add_argument(
        '-e', '--end', type=int, required=False, default=2000000,
        help="An optional endpoint for an importer.")
    args = parser.parse_args()

    if args.dir:
        def case_generator(dir_root):
            """Yield cases, one by one to the importer by recursing and
            iterating the import directory"""
            for root, dirnames, filenames in os.walk(dir_root):
                for filename in fnmatch.filter(filenames, '*'):
                    yield os.path.join(root, filename)

        # BUG FIX: was `args.root`, which argparse never defines (the option
        # is `--dir`), so this branch always crashed with an AttributeError.
        cases = case_generator(args.dir)
        i = 0
    else:
        def generate_random_line(file_name):
            """Yield a random (whole) line of file_name, forever."""
            while True:
                total_bytes = os.stat(file_name).st_size
                random_point = random.randint(0, total_bytes)
                # `with` closes the handle each iteration; the original
                # leaked one open file per case yielded.
                with open(file_name) as f:
                    f.seek(random_point)
                    f.readline()  # skip this line to clear the partial line
                    yield f.readline().strip()

        def case_generator(line_number):
            """Yield cases from the index file."""
            # The enumeration is zero-index, but files are one-index.
            enumerated_line_number = line_number - 1
            with open(args.file_name) as index_file:
                for i, line in enumerate(index_file):
                    if i >= enumerated_line_number:
                        yield line.strip()

        if args.random:
            cases = generate_random_line(args.file_name)
            i = 0
        elif args.resume:
            with open(args.marker) as marker:
                resume_point = int(marker.read().strip())
            cases = case_generator(resume_point)
            i = resume_point
        else:
            cases = case_generator(args.line)
            i = args.line

    for case_path in cases:
        if i % 1000 == 0:
            db.reset_queries()  # Else we leak memory when DEBUG is True
        if 'counter' in DEBUG:  # and i % 1000 == 0:
            log_print("\n%s: Doing case (%s): file://%s" %
                      (datetime.datetime.now(), i, case_path))
        try:
            doc = import_law_box_case(case_path)
            duplicates = find_duplicates(doc, case_path)
            if not args.simulate:
                if len(duplicates) == 0:
                    doc.html_lawbox, blocked = anonymize(doc.html)
                    doc.html = ''
                    if blocked:
                        doc.blocked = True
                        doc.date_blocked = now()
                    # Save nothing to the index for now (it'll get done when
                    # we find citations)
                    save_doc_and_cite(doc, index=False)
                if len(duplicates) == 1:
                    dup_helpers.merge_cases_simple(doc, duplicates[0])
                if len(duplicates) > 1:
                    # complex_merge
                    if 'log_multimerge' in DEBUG:
                        with open('index_multimerge.txt', 'a') as log:
                            log.write('%s\n' % case_path)
            if args.resume:
                # Don't change the progress marker unless you're in resume mode.
                with open(args.marker, 'w') as marker:
                    marker.write(str(i + 1))  # Files are one-index, not zero-index
            with open('lawbox_fix_file.pkl', 'wb') as fix_file:
                pickle.dump(fixes, fix_file)
            i += 1
            if i == args.end:
                log_print(
                    "Hit the endpoint after importing number %s. Breaking." % i)
                break
        except Exception:
            # The bound name `err` was never used; `except Exception:` is
            # valid in both Python 2 and 3 (the old comma form is Py2-only).
            log_print(traceback.format_exc())
            exit(1)