def upload_recap_json(self, pk):
    """Make a JSON object for a RECAP docket and upload it to IA"""
    docket, json_str = generate_ia_json(pk)

    json_name = get_docket_filename(docket.court_id, docket.pacer_case_id,
                                    'json')
    bucket = get_bucket_name(docket.court_id, docket.pacer_case_id)
    responses = upload_to_ia(
        self,
        identifier=bucket,
        files={json_name: StringIO(json_str)},
        title=best_case_name(docket),
        collection=settings.IA_COLLECTIONS,
        court_id=docket.court_id,
        source_url='https://www.courtlistener.com%s' %
                   docket.get_absolute_url(),
        media_type='texts',
        description="This item represents a case in PACER, the U.S. "
                    "Government's website for federal case data. This "
                    "information is uploaded quarterly. To see our most "
                    "recent version please use the source url parameter, "
                    "linked below. To see the canonical source for this data, "
                    "please consult PACER directly.",
    )
    if responses is None:
        # upload_to_ia signaled it could not attempt the upload at all.
        increment_failure_count(docket)
        return

    if not all(r.ok for r in responses):
        # At least one request to IA failed; record the failure.
        increment_failure_count(docket)
        return

    # Success: clear the failure bookkeeping and remember the IA location.
    docket.ia_upload_failure_count = None
    docket.ia_date_first_changed = None
    docket.ia_needs_upload = False
    docket.filepath_ia_json = "https://archive.org/download/%s/%s" % (
        bucket, json_name)
    docket.save()
def make_pdf_path(instance, filename, thumbs=False):
    """Return the storage path for a PDF (or its thumbnail).

    The path layout depends on the model type of ``instance``:

    - RECAPDocument: ``recap/<bucket>/<filename>``
    - ClaimHistory: ``claim/<bucket>/<filename>``
    - LASCPDF: a fully hand-built ``/us/state/ca/lasc/...`` path (the
      ``thumbs`` flag does not apply to this branch)

    :param instance: the model instance the file belongs to
    :param filename: the name of the file being stored
    :param thumbs: if True, store under the "-thumbnails" variant of the
        root directory (RECAP/claim paths only)
    :raises ValueError: if ``instance`` is not one of the known model types
    """
    # Imported locally to avoid circular imports at module load time.
    from cl.search.models import ClaimHistory, RECAPDocument
    from cl.lasc.models import LASCPDF

    # isinstance() is the idiomatic type check (handles subclasses, e.g.
    # Django proxy models), unlike the exact type()==T comparison it replaces.
    if isinstance(instance, RECAPDocument):
        root = "recap"
        court_id = instance.docket_entry.docket.court_id
        pacer_case_id = instance.docket_entry.docket.pacer_case_id
    elif isinstance(instance, ClaimHistory):
        root = "claim"
        court_id = instance.claim.docket.court_id
        pacer_case_id = instance.pacer_case_id
    elif isinstance(instance, LASCPDF):
        # LASC PDFs use a hand-built path; no bucket name is involved.
        slug = slugify(trunc(filename, 40))
        root = "/us/state/ca/lasc/%s/" % instance.docket_number
        file_name = "gov.ca.lasc.%s.%s.%s.pdf" % (
            instance.docket_number,
            instance.document_id,
            slug,
        )
        return os.path.join(root, file_name)
    else:
        raise ValueError("Unknown model type in make_pdf_path "
                         "function: %s" % type(instance))

    if thumbs:
        root = root + "-thumbnails"
    return os.path.join(root, get_bucket_name(court_id, pacer_case_id),
                        filename)
def upload_pdf_to_ia(self, rd_pk):
    """Upload a single RECAP PDF to the Internet Archive."""
    recap_doc = RECAPDocument.objects.get(pk=rd_pk)
    docket = recap_doc.docket_entry.docket

    ia_file_name = get_document_filename(
        docket.court_id,
        docket.pacer_case_id,
        recap_doc.document_number,
        recap_doc.attachment_number or 0,
    )
    bucket = get_bucket_name(docket.court_id, docket.pacer_case_id)
    responses = upload_to_ia(
        self,
        identifier=bucket,
        files=recap_doc.filepath_local.path,
        title=best_case_name(docket),
        collection=settings.IA_COLLECTIONS,
        court_id=docket.court_id,
        source_url='https://www.courtlistener.com%s' %
                   recap_doc.get_absolute_url(),
        media_type='texts',
        description="This item represents a case in PACER, the U.S. "
                    "Government's website for federal case data. If you wish "
                    "to see the entire case, please consult PACER directly.",
    )
    if responses is None:
        # Upload could not be attempted at all.
        increment_failure_count(recap_doc)
        return

    if not all(r.ok for r in responses):
        increment_failure_count(recap_doc)
        return

    # Success: reset failure tracking and record where the file lives on IA.
    recap_doc.ia_upload_failure_count = None
    recap_doc.filepath_ia = "https://archive.org/download/%s/%s" % (
        bucket, ia_file_name)
    recap_doc.save()
def upload_audio_to_ia(self, af_pk):
    """Upload an oral-argument audio file to the Internet Archive."""
    audio = Audio.objects.get(pk=af_pk)
    docket = audio.docket

    # Preserve the original file's extension in the IA file name.
    extension = audio.local_path_original_file.path.rsplit('.', 1)[1]
    ia_file_name = make_af_filename(
        docket.court_id, docket.docket_number, docket.date_argued, extension)
    bucket = get_bucket_name(docket.court_id, slugify(docket.docket_number))
    responses = upload_to_ia(
        self,
        identifier=bucket,
        files={ia_file_name: audio.local_path_original_file.path},
        title=best_case_name(docket),
        collection=settings.IA_OA_COLLECTIONS,
        court_id=docket.court_id,
        source_url='https://www.courtlistener.com%s' %
                   audio.get_absolute_url(),
        media_type='audio',
        description='This item represents an oral argument audio file as '
                    'scraped from a U.S. Government website by Free Law '
                    'Project.',
    )
    if responses is None:
        # Upload could not be attempted at all.
        increment_failure_count(audio)
        return

    if not all(r.ok for r in responses):
        increment_failure_count(audio)
        return

    # Success: reset failure tracking and record the IA download URL.
    audio.ia_upload_failure_count = None
    audio.filepath_ia = "https://archive.org/download/%s/%s" % (
        bucket, ia_file_name)
    audio.save()
def upload_free_opinion_to_ia(self, rd_pk):
    """Upload a free PACER opinion PDF to the Internet Archive.

    On transient errors (IA overload, bad XML, timeouts) the task retries
    until ``self.max_retries`` is exhausted, then gives up quietly so the
    next cron run can pick the item up again. On success, the document's
    ``filepath_ia`` is updated with the IA download URL.
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    d = rd.docket_entry.docket
    file_name = get_document_filename(
        d.court_id,
        d.pacer_case_id,
        rd.document_number,
        0,  # Attachment number is zero for all free opinions.
    )
    bucket_name = get_bucket_name(d.court_id, d.pacer_case_id)
    try:
        responses = upload_to_ia(
            identifier=bucket_name,
            files=rd.filepath_local.path,
            metadata={
                'title': best_case_name(d),
                'collection': settings.IA_COLLECTIONS,
                'contributor': '<a href="https://free.law">Free Law Project</a>',
                'court': d.court_id,
                'language': 'eng',
                'mediatype': 'texts',
                'description': "This item represents a case in PACER, "
                               "the U.S. Government's website for "
                               "federal case data. If you wish to see "
                               "the entire case, please consult PACER "
                               "directly.",
                'licenseurl': 'https://www.usa.gov/government-works',
            },
        )
    except (OverloadedException, ExpatError) as exc:
        # Overloaded: IA wants us to slow down.
        # ExpatError: The syntax of the XML file that's supposed to be
        # returned by IA is bad (or something).
        if self.request.retries == self.max_retries:
            # Give up for now. It'll get done next time cron is run.
            return
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [
            HTTP_403_FORBIDDEN,     # Can't access bucket, typically.
            HTTP_400_BAD_REQUEST,   # Corrupt PDF, typically.
        ]:
            # Unrecoverable client-side problem; hand the response back so
            # the caller can inspect it. No retry.
            return [exc.response]
        if self.request.retries == self.max_retries:
            # This exception is also raised when the endpoint is overloaded,
            # but doesn't get caught as an OverloadedException (handled
            # above) due to multiple processes running at the same time.
            # Just give up for now.
            return
        raise self.retry(exc=exc)
    except (requests.Timeout, requests.RequestException) as exc:
        # Note: HTTPError subclasses RequestException, so it must be (and
        # is) caught before this broader clause.
        logger.warning("Timeout or unknown RequestException. Unable to upload "
                       "to IA. Trying again if retries not exceeded: %s" % rd)
        if self.request.retries == self.max_retries:
            # Give up for now. It'll get done next time cron is run.
            return
        raise self.retry(exc=exc)
    if all(r.ok for r in responses):
        # Record the IA location; skip text extraction and search indexing
        # since only the IA path changed.
        rd.filepath_ia = "https://archive.org/download/%s/%s" % (
            bucket_name, file_name)
        rd.save(do_extraction=False, index=False)
def upload_free_opinion_to_ia(self, rd_pk):
    """Upload a free PACER opinion PDF to the Internet Archive.

    Retries on transient failures (overload, bad XML, timeouts) until the
    task's retry budget is spent; on success, stores the IA download URL on
    the RECAPDocument.
    """
    recap_doc = RECAPDocument.objects.get(pk=rd_pk)
    docket = recap_doc.docket_entry.docket
    ia_file_name = get_document_filename(
        docket.court_id,
        docket.pacer_case_id,
        recap_doc.document_number,
        0,  # Attachment number is zero for all free opinions.
    )
    bucket = get_bucket_name(docket.court_id, docket.pacer_case_id)

    ia_metadata = {
        'title': best_case_name(docket),
        'collection': settings.IA_COLLECTIONS,
        'contributor': '<a href="https://free.law">Free Law Project</a>',
        'court': docket.court_id,
        'language': 'eng',
        'mediatype': 'texts',
        'description': "This item represents a case in PACER, "
                       "the U.S. Government's website for "
                       "federal case data. If you wish to see "
                       "the entire case, please consult PACER "
                       "directly.",
        'licenseurl': 'https://www.usa.gov/government-works',
    }
    out_of_retries = self.request.retries == self.max_retries
    try:
        responses = upload_to_ia(
            identifier=bucket,
            files=recap_doc.filepath_local.path,
            metadata=ia_metadata,
        )
    except (OverloadedException, ExpatError) as exc:
        # Overloaded: IA wants us to slow down.
        # ExpatError: IA returned malformed XML where valid XML was expected.
        if not out_of_retries:
            raise self.retry(exc=exc)
        # Give up for now. It'll get done next time cron is run.
        return
    except HTTPError as exc:
        if exc.response.status_code in [
            HTTP_403_FORBIDDEN,     # Can't access bucket, typically.
            HTTP_400_BAD_REQUEST,   # Corrupt PDF, typically.
        ]:
            # Unrecoverable; surface the response to the caller, no retry.
            return [exc.response]
        if not out_of_retries:
            raise self.retry(exc=exc)
        # This exception is also raised when the endpoint is overloaded, but
        # doesn't get caught as an OverloadedException due to multiple
        # processes running at the same time. Just give up for now.
        return
    except (requests.Timeout, requests.RequestException) as exc:
        logger.warning("Timeout or unknown RequestException. Unable to upload "
                       "to IA. Trying again if retries not exceeded: %s"
                       % recap_doc)
        if not out_of_retries:
            raise self.retry(exc=exc)
        # Give up for now. It'll get done next time cron is run.
        return

    if all(r.ok for r in responses):
        recap_doc.filepath_ia = "https://archive.org/download/%s/%s" % (
            bucket, ia_file_name)
        recap_doc.save(do_extraction=False, index=False)
def upload_recap_json(self, pk):
    """Make a JSON object for a RECAP docket and upload it to IA"""
    # Highly optimized query: a fixed 13 DB hits to render a docket's JSON,
    # no matter how many related objects (entries, parties, etc.) it has.
    docket_qs = Docket.objects.filter(pk=pk).select_related(
        'originating_court_information',
    ).prefetch_related(
        'panel',
        'parties__attorneys__roles',
        'parties__party_types__criminal_complaints',
        'parties__party_types__criminal_counts',
        # Django appears to have a bug where you can't defer a field on a
        # queryset where you prefetch the values. If you try to, it crashes.
        # We should be able to just do the prefetch below like the ones
        # above and then do the defer statement at the end, but that throws
        # an error.
        Prefetch(
            'docket_entries__recap_documents',
            queryset=RECAPDocument.objects.all().defer('plain_text')
        )
    )
    docket = docket_qs[0]
    json_str = JSONRenderer().render(
        IADocketSerializer(docket).data,
        accepted_media_type='application/json; indent=2',
    )

    json_name = get_docket_filename(docket.court_id, docket.pacer_case_id,
                                    'json')
    bucket = get_bucket_name(docket.court_id, docket.pacer_case_id)
    responses = upload_to_ia(
        self,
        identifier=bucket,
        files={json_name: StringIO(json_str)},
        title=best_case_name(docket),
        collection=settings.IA_COLLECTIONS,
        court_id=docket.court_id,
        source_url='https://www.courtlistener.com%s' %
                   docket.get_absolute_url(),
        media_type='texts',
        description="This item represents a case in PACER, the U.S. "
                    "Government's website for federal case data. This "
                    "information is uploaded quarterly. To see our most "
                    "recent version please use the source url parameter, "
                    "linked below. To see the canonical source for this data, "
                    "please consult PACER directly.",
    )
    if responses is None:
        # Upload could not be attempted at all.
        increment_failure_count(docket)
        return

    if not all(r.ok for r in responses):
        increment_failure_count(docket)
        return

    # Success: clear failure bookkeeping and record the IA location.
    docket.ia_upload_failure_count = None
    docket.ia_date_first_changed = None
    docket.filepath_ia_json = "https://archive.org/download/%s/%s" % (
        bucket, json_name)
    # NOTE(review): mark_ia_upload_needed() is invoked on the *success*
    # path here, unlike the sibling version of this task, which clears
    # ia_needs_upload instead — confirm this is intentional.
    mark_ia_upload_needed(docket)
    docket.save()
def base_recap_path(instance, filename, base_dir):
    """Make a filepath, accepting an extra parameter for the base directory

    Mirrors technique used by original RECAP server to upload PDFs to IA.
    """
    docket = instance.docket_entry.docket
    bucket = get_bucket_name(docket.court_id, docket.pacer_case_id)
    return os.path.join(base_dir, bucket, filename)
def base_recap_path(instance, filename, base_dir):
    """Make a filepath, accepting an extra parameter for the base directory

    Mirrors technique used by original RECAP server to upload PDFs to IA.

    :param instance: a model instance with a docket_entry.docket relation
        (e.g. a RECAPDocument) providing court_id and pacer_case_id
    :param filename: the name of the file being stored
    :param base_dir: the root directory to nest the bucket path under
    :return: ``<base_dir>/<bucket-name>/<filename>``
    """
    return os.path.join(
        base_dir,
        get_bucket_name(
            instance.docket_entry.docket.court_id,
            instance.docket_entry.docket.pacer_case_id,
        ),
        filename,
    )
def make_recap_pdf_path(instance, filename):
    """Make a path for storing a PACER document in RECAP.

    Mirrors technique used by original RECAP server to upload PDFs to IA.

    :param instance: a model instance with a docket_entry.docket relation
        (e.g. a RECAPDocument) providing court_id and pacer_case_id
    :param filename: the name of the file being stored
    :return: ``recap/<bucket-name>/<filename>``
    """
    return os.path.join(
        "recap",
        get_bucket_name(
            instance.docket_entry.docket.court_id,
            instance.docket_entry.docket.pacer_case_id,
        ),
        filename,
    )
def make_recap_pdf_path(instance, filename):
    """Make a path for storing a PACER document in RECAP.

    Mirrors technique used by original RECAP server to upload PDFs to IA.
    """
    docket = instance.docket_entry.docket
    bucket = get_bucket_name(docket.court_id, docket.pacer_case_id)
    return os.path.join("recap", bucket, filename)