def save(self):
    """Save uploaded Tennessee Workers Comp/Appeal to db.

    :return: Cluster
    """
    sha1_hash = sha1(force_bytes(self.cleaned_data.get("pdf_upload")))
    court = Court.objects.get(pk=self.cleaned_data.get("court_str"))
    docket, opinion, cluster, citations = make_objects(
        self.cleaned_data.get("item"),
        court,
        sha1_hash,
        self.cleaned_data.get("pdf_upload"),
    )
    save_everything(
        items={
            "docket": docket,
            "opinion": opinion,
            "cluster": cluster,
            "citations": citations,
        },
        index=False,
    )
    extract_doc_content.delay(
        opinion.pk, ocr_available=True, citation_jitter=True
    )
    logging.info(
        "Successfully added Tennessee object cluster: %s", cluster.id
    )
    return cluster
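# A minimal sketch of the hashing used throughout these importers. It assumes
# the `sha1` helper used above is a thin wrapper that returns
# hashlib.sha1(...).hexdigest(); force_bytes is Django's encoding helper. The
# helper name below is illustrative, not the project's actual function.
import hashlib

from django.utils.encoding import force_bytes


def sha1_hex(data):
    """Hex SHA-1 digest of arbitrary content, used as a dedup lookup key."""
    return hashlib.sha1(force_bytes(data)).hexdigest()


# Identical bytes always produce the same key, so a re-downloaded document
# can be matched against what is already in the database:
assert sha1_hex(b"same content") == sha1_hex(b"same content")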
def scrape_court(self, site, full_crawl=False):
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split('.')[-1].split('_')[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(court, full_crawl=full_crawl)
    abort = dup_checker.abort_by_url_hash(site.url, site.hash)
    if not abort:
        if site.cookies:
            logger.info("Using cookies: %s" % site.cookies)
        for i, item in enumerate(site):
            msg, r = get_binary_content(item['download_urls'],
                                        site.cookies,
                                        site._get_adapter_instance(),
                                        method=site.method)
            if msg:
                logger.warning(msg)
                ErrorLog(log_level='WARNING', court=court,
                         message=msg).save()
                continue

            content = site.cleanup_content(r.content)

            current_date = item['case_dates']
            try:
                next_date = site[i + 1]['case_dates']
            except IndexError:
                next_date = None

            # request.content is sometimes a str, sometimes unicode, so
            # force it all to be bytes, pleasing hashlib.
            sha1_hash = sha1(force_bytes(content))
            if (court_str == 'nev' and
                    item['precedential_statuses'] == 'Unpublished'):
                # Nevada's non-precedential cases have different SHA1
                # sums every time.
                lookup_params = {
                    'lookup_value': item['download_urls'],
                    'lookup_by': 'download_url'
                }
            else:
                lookup_params = {
                    'lookup_value': sha1_hash,
                    'lookup_by': 'sha1'
                }

            onwards = dup_checker.press_on(Opinion, current_date,
                                           next_date, **lookup_params)
            if dup_checker.emulate_break:
                break

            if onwards:
                # Not a duplicate, carry on
                logger.info('Adding new document found at: %s' %
                            item['download_urls'].encode('utf-8'))
                dup_checker.reset()

                docket, opinion, cluster, citations, error = \
                    self.make_objects(item, court, sha1_hash, content)
                if error:
                    download_error = True
                    continue

                self.save_everything(items={
                    'docket': docket,
                    'opinion': opinion,
                    'cluster': cluster,
                    'citations': citations,
                }, index=False)
                extract_doc_content.delay(
                    opinion.pk,
                    do_ocr=True,
                    citation_jitter=True,
                )

                logger.info("Successfully added doc {pk}: {name}".format(
                    pk=opinion.pk,
                    name=item['case_names'].encode('utf-8'),
                ))

        # Update the hash if everything finishes properly.
        logger.info("%s: Successfully crawled opinions." % site.court_id)
        if not download_error and not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
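# A minimal sketch of the Celery hand-off used by scrape_court, assuming
# extract_doc_content is a Celery task: .delay() is shorthand for
# .apply_async(), so text extraction (and any OCR) is queued to a worker
# instead of blocking the crawl loop. The app, broker URL, and task body
# below are illustrative stand-ins, not the project's real configuration.
from celery import Celery

app = Celery("scrapers", broker="redis://localhost:6379/0")  # assumed broker


@app.task
def extract_doc_content(opinion_pk, do_ocr=False, citation_jitter=False):
    """Stand-in task: load the stored document and extract its text."""
    ...


# Queues the job and returns an AsyncResult immediately:
# extract_doc_content.delay(42, do_ocr=True, citation_jitter=True)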
def scrape_court(self, site, full_crawl=False, ocr_available=True):
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split(".")[-1].split("_")[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(court, full_crawl=full_crawl)
    if dup_checker.abort_by_url_hash(site.url, site.hash):
        return

    if site.cookies:
        logger.info(f"Using cookies: {site.cookies}")

    for i, item in enumerate(site):
        msg, r = get_binary_content(
            item["download_urls"],
            site.cookies,
            method=site.method,
        )
        if msg:
            logger.warning(msg)
            ErrorLog(log_level="WARNING", court=court, message=msg).save()
            continue

        content = site.cleanup_content(r.content)

        current_date = item["case_dates"]
        try:
            next_date = site[i + 1]["case_dates"]
        except IndexError:
            next_date = None

        # request.content is sometimes a str, sometimes unicode, so
        # force it all to be bytes, pleasing hashlib.
        sha1_hash = sha1(force_bytes(content))
        if (
            court_str == "nev"
            and item["precedential_statuses"] == "Unpublished"
        ):
            # Nevada's non-precedential cases have different SHA1 sums
            # every time.
            lookup_params = {
                "lookup_value": item["download_urls"],
                "lookup_by": "download_url",
            }
        else:
            lookup_params = {
                "lookup_value": sha1_hash,
                "lookup_by": "sha1",
            }

        proceed = dup_checker.press_on(
            Opinion, current_date, next_date, **lookup_params
        )
        if dup_checker.emulate_break:
            break
        if not proceed:
            continue

        # Not a duplicate, carry on
        logger.info(f"Adding new document found at: {item['download_urls']}")
        dup_checker.reset()

        docket, opinion, cluster, citations = make_objects(
            item, court, sha1_hash, content
        )

        save_everything(
            items={
                "docket": docket,
                "opinion": opinion,
                "cluster": cluster,
                "citations": citations,
            },
            index=False,
        )
        extract_doc_content.delay(
            opinion.pk, ocr_available=ocr_available, citation_jitter=True
        )

        logger.info(
            f"Successfully added doc {opinion.pk}: {item['case_names']}"
        )

    # Update the hash if everything finishes properly.
    logger.info(f"{site.court_id}: Successfully crawled opinions.")
    if not full_crawl:
        # Only update the hash on regular (non-full) crawls.
        dup_checker.update_site_hash(site.hash)
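# The Nevada branch above encodes a dedup policy worth naming: most courts are
# deduped by the SHA-1 of the downloaded content, but Nevada's unpublished
# opinions hash differently on every download, so they fall back to deduping
# by download URL. A sketch of that choice as a standalone helper (the helper
# itself is illustrative; only the logic comes from the code above):
def build_lookup_params(court_str, item, sha1_hash):
    """Pick the lookup key passed to DupChecker.press_on()."""
    if (
        court_str == "nev"
        and item["precedential_statuses"] == "Unpublished"
    ):
        return {
            "lookup_value": item["download_urls"],
            "lookup_by": "download_url",
        }
    return {"lookup_value": sha1_hash, "lookup_by": "sha1"}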
            pdf_data,
        )
        save_everything(
            items={
                "docket": docket,
                "opinion": opinion,
                "cluster": cluster,
                "citations": citations,
            },
            index=False,
        )
        extract_doc_content.delay(
            opinion.pk,
            ocr_available=ocr_available,
            citation_jitter=True,
        )
        logging.info(
            "Successfully added Tennessee object cluster: %s", cluster.id
        )


class Command(VerboseCommand):
    help = "Import TN data corpus received from TN Workers Comp boards."

    def add_arguments(self, parser):
        parser.add_argument(
            "--input-file",
            type=argparse.FileType("r"),
            help="The filepath to our preprocessed data file.",
            required=True,
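# A note on the --input-file argument above: argparse.FileType("r") makes
# argparse open the file itself, so the parsed value is an open text-mode
# file handle rather than a path string. A minimal, self-contained sketch
# (the parser and file name are illustrative):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--input-file", type=argparse.FileType("r"), required=True)

# args = parser.parse_args(["--input-file", "tn_corpus.json"])
# args.input_file.read()  # already open; no open() call needed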