def import_law_box_case(case_path):
    """Parse one Lawbox case file and flag it when no filing date is found.

    The file at ``case_path`` is read and passed to
    ``get_html_from_raw_text``. Citations, the court, and the filing dates
    are then extracted from the resulting HTML trees. When no date is found,
    the ``DEBUG`` collection (defined outside this function) decides what
    happens:

     - ``'review_issues'``: open the file in Firefox, wait for that process
       to exit, then prompt the operator on stdin.
     - ``'log_bad_values'``: append ``case_path`` to
       ``missing_dates_post_focus.txt`` in the current working directory.

    :param case_path: Filesystem path of the Lawbox case file.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(case_path) as case_file:
        raw_text = case_file.read()

    clean_html_tree, complete_html_tree, clean_html_str, body_text = \
        get_html_from_raw_text(raw_text)
    citations = get_citations_from_tree(complete_html_tree, case_path)
    court = get_court_object(clean_html_tree, citations, case_path)
    dates = get_date_filed(clean_html_tree, citations, case_path=case_path, court=court)

    if not dates:
        if 'review_issues' in DEBUG:
            # communicate() blocks until the browser process terminates.
            subprocess.Popen(['firefox', 'file://%s' % case_path], shell=False).communicate()
            raw_input("No date identified! Can we fix this and restart, or just press enter to log it? ")
        if 'log_bad_values' in DEBUG:
            # Write the failed case out to file.
            with open('missing_dates_post_focus.txt', 'a') as out:
                out.write('%s\n' % case_path)
def import_law_box_case(case_path):
    """Parse one Lawbox case file and flag it when no filing date is found.

    The file at ``case_path`` is read and passed to
    ``get_html_from_raw_text``. Citations, the court, and the filing dates
    are then extracted from the resulting HTML trees. When no date is found,
    the ``DEBUG`` collection (defined outside this function) decides what
    happens:

     - ``'review_issues'``: open the file in Firefox, wait for that process
       to exit, then prompt the operator on stdin.
     - ``'log_bad_values'``: append ``case_path`` to
       ``missing_dates_post_focus.txt`` in the current working directory.

    :param case_path: Filesystem path of the Lawbox case file.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(case_path) as case_file:
        raw_text = case_file.read()

    clean_html_tree, complete_html_tree, clean_html_str, body_text = get_html_from_raw_text(
        raw_text)
    citations = get_citations_from_tree(complete_html_tree, case_path)
    court = get_court_object(clean_html_tree, citations, case_path)
    dates = get_date_filed(clean_html_tree,
                           citations,
                           case_path=case_path,
                           court=court)

    if not dates:
        if 'review_issues' in DEBUG:
            # communicate() blocks until the browser process terminates.
            subprocess.Popen(['firefox', 'file://%s' % case_path],
                             shell=False).communicate()
            raw_input(
                "No date identified! Can we fix this and restart, or just press enter to log it? "
            )
        if 'log_bad_values' in DEBUG:
            # Write the failed case out to file.
            with open('missing_dates_post_focus.txt', 'a') as out:
                out.write('%s\n' % case_path)
def cleaner(simulate=False, verbose=False):
    """Recompute the filing date of Lawbox items that contain an "Argued" day range.

    Find items that:
     - Contain the word "argued"
     - Occur between 2002-01-01 and 2031-12-31
     - Are precedential
     - Have a source == L.
     - Match a regex for the funky date pattern

    The first three conditions are applied by the Solr query. The source and
    regex checks are applied to each result it returns. For every item that
    passes, the date is recomputed from ``doc.html_lawbox`` with
    ``get_date_filed``; when it differs from the stored ``date_filed`` the
    document is saved with the new date.

    :param simulate: When true, no document is modified or saved.
    :param verbose: When true, print progress for each item examined.
    """
    conn = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode="rw")
    q = {
        "q": "argued",
        "fl": "id,text,source",
        "fq": [
            "dateFiled:[2002-01-01T00:00:00Z TO 2031-12-31T00:00:00Z]",
            'status_exact:("Precedential")',
        ],
        "sort": "dateFiled asc",
        "caller": "cleanup_script",
    }
    results = conn.raw_query(**q)

    # Compiled once, outside the loop, from a raw string so the backslash
    # escapes reach the regex engine untouched. Matches text such as
    # "Argued March 3-4, 2003", i.e. a day range rather than a single day.
    bad_date_re = re.compile(r"Argued.{1,12}\d{1,2}-\d{1,2}, \d{4}")

    for r in results:
        if verbose:
            print("Running tests on item %s" % r["id"])
        # We iterate over the search results. For each one, we run tests on it
        # to see if it needs a fix. If so, we get the record from the database
        # and update it. If not, we continue.
        if r["source"] != "L":
            # Only affects pure Lawbox cases. Merged cases did not have their
            # date updated.
            if verbose:
                print(" - Source is %s. Punting." % r["source"])
            continue

        if not bad_date_re.search(r["text"]):
            # Lacks the affronting line. Onwards.
            if verbose:
                print(" - Lacks the bad date string. Punting.")
            continue

        if verbose:
            print(" - All tests pass. This item may be modified. (Simulate is: %s)" % simulate)

        doc = Document.objects.get(pk=r["id"])
        clean_html_tree = html.fromstring(doc.html_lawbox)
        # NOTE(review): assumes get_date_filed returns a datetime-like object
        # here (it must expose .date()) -- confirm against its definition.
        new_date = get_date_filed(clean_html_tree, citations=[]).date()

        if verbose:
            print(" - https://www.courtlistener.com%s" % doc.get_absolute_url())
            print(" - Old date was: %s" % doc.date_filed)
            print(" - New date is: %s" % new_date)

        if new_date == doc.date_filed:
            # No change needed, simply move on.
            if verbose:
                print(" - Dates are equal: Proceeding.")
            continue

        if verbose:
            print(" - Updating with new date.")
        if not simulate:
            doc.date_filed = new_date
            doc.save(index=True, force_commit=False)

    # Do one big commit at the end (this runs in simulate mode as well).
    conn.commit()
def cleaner(simulate=False, verbose=False):
    """Recompute the filing date of Lawbox items that contain an "Argued" day range.

    Find items that:
     - Contain the word "argued"
     - Occur between 2002-01-01 and 2031-12-31
     - Are precedential
     - Have a source == L.
     - Match a regex for the funky date pattern

    The first three conditions are applied by the Solr query. The source and
    regex checks are applied to each result it returns. For every item that
    passes, the date is recomputed from ``doc.html_lawbox`` with
    ``get_date_filed``; when it differs from the stored ``date_filed`` the
    document is saved with the new date.

    :param simulate: When true, no document is modified or saved.
    :param verbose: When true, print progress for each item examined.
    """
    conn = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='rw')
    q = {
        'q': 'argued',
        'fl': 'id,text,source',
        'fq': [
            'dateFiled:[2002-01-01T00:00:00Z TO 2031-12-31T00:00:00Z]',
            'status_exact:("Precedential")',
        ],
        'sort': 'dateFiled asc',
        'caller': 'cleanup_script',
    }
    results = conn.raw_query(**q)

    # Compiled once, outside the loop, from a raw string so the backslash
    # escapes reach the regex engine untouched. Matches text such as
    # "Argued March 3-4, 2003", i.e. a day range rather than a single day.
    bad_date_re = re.compile(r'Argued.{1,12}\d{1,2}-\d{1,2}, \d{4}')

    for r in results:
        if verbose:
            print("Running tests on item %s" % r['id'])
        # We iterate over the search results. For each one, we run tests on it
        # to see if it needs a fix. If so, we get the record from the database
        # and update it. If not, we continue.
        if r['source'] != 'L':
            # Only affects pure Lawbox cases. Merged cases did not have their
            # date updated.
            if verbose:
                print(" - Source is %s. Punting." % r['source'])
            continue

        if not bad_date_re.search(r['text']):
            # Lacks the affronting line. Onwards.
            if verbose:
                print(" - Lacks the bad date string. Punting.")
            continue

        if verbose:
            print(" - All tests pass. This item may be modified. (Simulate is: %s)" % simulate)

        doc = Document.objects.get(pk=r['id'])
        clean_html_tree = html.fromstring(doc.html_lawbox)
        # NOTE(review): assumes get_date_filed returns a datetime-like object
        # here (it must expose .date()) -- confirm against its definition.
        new_date = get_date_filed(clean_html_tree, citations=[]).date()

        if verbose:
            print(" - https://www.courtlistener.com%s" % doc.get_absolute_url())
            print(" - Old date was: %s" % doc.date_filed)
            print(" - New date is: %s" % new_date)

        if new_date == doc.date_filed:
            # No change needed, simply move on.
            if verbose:
                print(" - Dates are equal: Proceeding.")
            continue

        if verbose:
            print(" - Updating with new date.")
        if not simulate:
            doc.date_filed = new_date
            doc.save(index=True, force_commit=False)

    # Do one big commit at the end (this runs in simulate mode as well).
    conn.commit()