Пример #1
0
def serve_static_file(request, file_path=''):
    """Sends a static file to a user.

    This serves up the static case files such as the PDFs in a way that can be
    blocked from search engines if necessary. We do four things:
     - Look up the document  or audio file associated with the filepath
     - Check if it's blocked
     - If blocked, we set the x-robots-tag HTTP header
     - Serve up the file using Apache2's xsendfile
    """
    response = HttpResponse()
    file_loc = os.path.join(settings.MEDIA_ROOT, file_path.encode('utf-8'))
    if file_path.startswith('mp3'):
        item = get_object_or_404(Audio, local_path_mp3=file_path)
        mimetype = 'audio/mpeg'
    elif file_path.startswith('recap'):
        # Either we serve up a special HTML file to make open graph crawlers
        # happy, or we serve the PDF to make a human happy.
        og_disabled = bool(request.GET.get('no-og'))
        if is_og_bot(request) and not og_disabled:
            # Serve up the regular HTML page, which has the twitter card info.
            try:
                rd = RECAPDocument.objects.get(filepath_local=file_path)
            except (RECAPDocument.DoesNotExist,
                    RECAPDocument.MultipleObjectsReturned):
                pass
            else:
                return view_recap_document(
                    request,
                    docket_id=rd.docket_entry.docket_id,
                    doc_num=rd.document_number,
                    att_num=rd.attachment_number
                )
        # A human or unable to find the item in the DB. Create an empty object,
        # and set it to blocked. (All recap pdfs are blocked.)
        item = RECAPDocument()
        item.blocked = True
        mimetype = 'application/pdf'
    else:
        item = get_object_or_404(Opinion, local_path=file_path)
        item.blocked = item.cluster.blocked
        try:
            mimetype = magic.from_file(file_loc, mime=True)
        except IOError:
            raise Http404

    if item.blocked:
        response['X-Robots-Tag'] = 'noindex, noodp, noarchive, noimageindex'

    if settings.DEVELOPMENT:
        # X-Sendfile will only confuse you in a dev env.
        response.content = open(file_loc, 'r').read()
    else:
        response['X-Sendfile'] = file_loc
    file_name = file_path.split('/')[-1]
    response['Content-Disposition'] = 'inline; filename="%s"' % \
                                      file_name.encode('utf-8')
    response['Content-Type'] = mimetype
    return response
Пример #2
0
def serve_static_file(request, file_path=''):
    """Sends a static file to a user.

    This serves up the static case files such as the PDFs in a way that can be
    blocked from search engines if necessary. We do four things:
     - Look up the document  or audio file associated with the filepath
     - Check if it's blocked
     - If blocked, we set the x-robots-tag HTTP header
     - Serve up the file using Apache2's xsendfile
    """
    response = HttpResponse()
    file_loc = os.path.join(settings.MEDIA_ROOT, file_path.encode('utf-8'))
    if file_path.startswith('mp3'):
        item = get_object_or_404(Audio, local_path_mp3=file_path)
        mimetype = 'audio/mpeg'
    elif file_path.startswith('recap'):
        # Either we serve up a special HTML file to make open graph crawlers
        # happy, or we serve the PDF to make a human happy.
        og_disabled = bool(request.GET.get('no-og'))
        if is_og_bot(request) and not og_disabled:
            # Serve up the regular HTML page, which has the twitter card info.
            try:
                rd = RECAPDocument.objects.get(filepath_local=file_path)
            except (RECAPDocument.DoesNotExist,
                    RECAPDocument.MultipleObjectsReturned):
                pass
            else:
                return view_recap_document(
                    request,
                    docket_id=rd.docket_entry.docket_id,
                    doc_num=rd.document_number,
                    att_num=rd.attachment_number
                )
        # A human or unable to find the item in the DB. Create an empty object,
        # and set it to blocked. (All recap pdfs are blocked.)
        item = RECAPDocument()
        item.blocked = True
        mimetype = 'application/pdf'
    else:
        item = get_object_or_404(Opinion, local_path=file_path)
        item.blocked = item.cluster.blocked
        try:
            mimetype = magic.from_file(file_loc, mime=True)
        except IOError:
            raise Http404

    if item.blocked:
        response['X-Robots-Tag'] = 'noindex, noodp, noarchive, noimageindex'

    if settings.DEVELOPMENT:
        # X-Sendfile will only confuse you in a dev env.
        response.content = open(file_loc, 'r').read()
    else:
        response['X-Sendfile'] = file_loc
    file_name = file_path.split('/')[-1]
    response['Content-Disposition'] = 'inline; filename="%s"' % \
                                      file_name.encode('utf-8')
    response['Content-Type'] = mimetype
    return response
Пример #3
0
def serve_static_file(request, file_path=''):
    """Sends a static file to a user.

    This serves up the static case files such as the PDFs in a way that can be
    blocked from search engines if necessary. We do four things:
     - Look up the document  or audio file associated with the filepath
     - Check if it's blocked
     - If blocked, we set the x-robots-tag HTTP header
     - Serve up the file using Apache2's xsendfile
    """
    response = HttpResponse()
    file_loc = os.path.join(settings.MEDIA_ROOT, file_path.encode('utf-8'))
    if file_path.startswith('mp3'):
        item = get_object_or_404(Audio, local_path_mp3=file_path)
        mimetype = 'audio/mpeg'
    elif file_path.startswith('recap'):
        # Create an empty object, and set it to blocked. No need to hit the DB
        # since all RECAP documents are blocked.
        item = RECAPDocument()
        item.blocked = True
        mimetype = 'application/pdf'
    else:
        item = get_object_or_404(Opinion, local_path=file_path)
        item.blocked = item.cluster.blocked
        try:
            mimetype = magic.from_file(file_loc, mime=True)
        except IOError:
            raise Http404

    if item.blocked:
        response['X-Robots-Tag'] = 'noindex, noodp, noarchive, noimageindex'

    if settings.DEVELOPMENT:
        # X-Sendfile will only confuse you in a dev env.
        response.content = open(file_loc, 'r').read()
    else:
        response['X-Sendfile'] = file_loc
    file_name = file_path.split('/')[-1]
    response['Content-Disposition'] = 'inline; filename="%s"' % \
                                      file_name.encode('utf-8')
    response['Content-Type'] = mimetype
    if not is_bot(request):
        tally_stat('case_page.static_file.served')
    return response
Пример #4
0
def serve_static_file(request, file_path=''):
    """Sends a static file to a user.

    This serves up the static case files such as the PDFs in a way that can be
    blocked from search engines if necessary. We do four things:
     - Look up the document  or audio file associated with the filepath
     - Check if it's blocked
     - If blocked, we set the x-robots-tag HTTP header
     - Serve up the file using Apache2's xsendfile
    """
    response = HttpResponse()
    file_loc = os.path.join(settings.MEDIA_ROOT, file_path.encode('utf-8'))
    if file_path.startswith('mp3'):
        item = get_object_or_404(Audio, local_path_mp3=file_path)
        mimetype = 'audio/mpeg'
    elif file_path.startswith('recap'):
        # Create an empty object, and set it to blocked. No need to hit the DB
        # since all RECAP documents are blocked.
        item = RECAPDocument()
        item.blocked = True
        mimetype = 'application/pdf'
    else:
        item = get_object_or_404(Opinion, local_path=file_path)
        item.blocked = item.cluster.blocked
        try:
            mimetype = magic.from_file(file_loc, mime=True)
        except IOError:
            raise Http404

    if item.blocked:
        response['X-Robots-Tag'] = 'noindex, noodp, noarchive, noimageindex'

    if settings.DEVELOPMENT:
        # X-Sendfile will only confuse you in a dev env.
        response.content = open(file_loc, 'r').read()
    else:
        response['X-Sendfile'] = file_loc
    file_name = file_path.split('/')[-1]
    response['Content-Disposition'] = 'attachment; filename="%s"' % \
                                      file_name.encode('utf-8')
    response['Content-Type'] = mimetype
    if not is_bot(request):
        tally_stat('case_page.static_file.served')
    return response
Пример #5
0
def make_thumb_if_needed(
    request: HttpRequest,
    rd: RECAPDocument,
) -> RECAPDocument:
    """Make a thumbnail for a RECAP Document, if needed

    If a thumbnail is needed, can be made and should be made, make one.

    :param request: The request sent to the server
    :param rd: A RECAPDocument object
    """
    needs_thumb = rd.thumbnail_status != THUMBNAIL_STATUSES.COMPLETE
    if all([needs_thumb, rd.has_valid_pdf, is_og_bot(request)]):
        make_png_thumbnail_for_instance(
            pk=rd.pk,
            klass=RECAPDocument,
            file_attr="filepath_local",
            max_dimension=1068,
        )
        rd.refresh_from_db()
    return rd
Пример #6
0
def process_recap_pdf(self, pk):
    """Process an uploaded PDF from the RECAP API endpoint.

    :param pk: The PK of the processing queue item you want to work on.
    :return: A RECAPDocument object that was created or updated.
    """
    """Save a RECAP PDF to the database."""
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)

    if pq.attachment_number is None:
        document_type = RECAPDocument.PACER_DOCUMENT
    else:
        document_type = RECAPDocument.ATTACHMENT

    logger.info("Processing RECAP item (debug is: %s): %s " % (pq.debug, pq))
    try:
        if pq.pacer_case_id:
            rd = RECAPDocument.objects.get(
                docket_entry__docket__pacer_case_id=pq.pacer_case_id,
                pacer_doc_id=pq.pacer_doc_id,
            )
        else:
            # Sometimes we don't have the case ID from PACER. Try to make this
            # work anyway.
            rd = RECAPDocument.objects.get(pacer_doc_id=pq.pacer_doc_id,)
    except (RECAPDocument.DoesNotExist, RECAPDocument.MultipleObjectsReturned):
        try:
            d = Docket.objects.get(
                pacer_case_id=pq.pacer_case_id, court_id=pq.court_id
            )
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully
            # the docket will be in place soon (it could be in a
            # different upload task that hasn't yet been processed).
            logger.warning(
                "Unable to find docket for processing queue '%s'. "
                "Retrying if max_retries is not exceeded." % pq
            )
            error_message = "Unable to find docket for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, error_message, PROCESSING_STATUS.FAILED)
                return None
            else:
                mark_pq_status(
                    pq, error_message, PROCESSING_STATUS.QUEUED_FOR_RETRY
                )
                raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None

        # Got the Docket, attempt to get/create the DocketEntry, and then
        # create the RECAPDocument
        try:
            de = DocketEntry.objects.get(
                docket=d, entry_number=pq.document_number
            )
        except DocketEntry.DoesNotExist as exc:
            logger.warning(
                "Unable to find docket entry for processing "
                "queue '%s'." % pq
            )
            msg = "Unable to find docket entry for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
                return None
            else:
                mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
                raise self.retry(exc=exc)
        else:
            # If we're here, we've got the docket and docket
            # entry, but were unable to find the document by
            # pacer_doc_id. This happens when pacer_doc_id is
            # missing, for example. ∴, try to get the document
            # from the docket entry.
            try:
                rd = RECAPDocument.objects.get(
                    docket_entry=de,
                    document_number=pq.document_number,
                    attachment_number=pq.attachment_number,
                    document_type=document_type,
                )
            except (
                RECAPDocument.DoesNotExist,
                RECAPDocument.MultipleObjectsReturned,
            ):
                # Unable to find it. Make a new item.
                rd = RECAPDocument(
                    docket_entry=de,
                    pacer_doc_id=pq.pacer_doc_id,
                    document_type=document_type,
                )

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    try:
        content = pq.filepath_local.read()
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    new_sha1 = sha1(content)
    existing_document = all(
        [
            rd.sha1 == new_sha1,
            rd.is_available,
            rd.filepath_local and os.path.isfile(rd.filepath_local.path),
        ]
    )
    if not existing_document:
        # Different sha1, it wasn't available, or it's missing from disk. Move
        # the new file over from the processing queue storage.
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        if not pq.debug:
            rd.filepath_local.save(file_name, cf, save=False)

            # Do page count and extraction
            extension = rd.filepath_local.path.split(".")[-1]
            rd.page_count = get_page_count(rd.filepath_local.path, extension)
            rd.file_size = rd.filepath_local.size

        rd.ocr_status = None
        rd.is_available = True
        rd.sha1 = new_sha1
        rd.date_upload = now()

    if not pq.debug:
        try:
            rd.save()
        except (IntegrityError, ValidationError):
            msg = "Duplicate key on unique_together constraint"
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            rd.filepath_local.delete(save=False)
            return None

    if not existing_document and not pq.debug:
        extract_recap_pdf(rd.pk)
        add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_pq_successful(
        pq,
        d_id=rd.docket_entry.docket_id,
        de_id=rd.docket_entry_id,
        rd_id=rd.pk,
    )
    mark_ia_upload_needed(rd.docket_entry.docket, save_docket=True)
    return rd
Пример #7
0
    def make_recap_document(self, doc_node, docket_entry, entry_number,
                            attachment_number, document_type, debug):
        """Make a PACER document."""
        pacer_document_id = self.get_str_from_node(doc_node, 'pacer_doc_id')
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=docket_entry,
                document_number=entry_number,
                # Use the attachment number if it is not 0, else use None.
                attachment_number=attachment_number or None,
            )
        except RECAPDocument.DoesNotExist:
            rd = RECAPDocument(
                docket_entry=docket_entry,
                pacer_doc_id=pacer_document_id,
                document_number=entry_number,
            )
        else:
            rd.pacer_doc_id = pacer_document_id or rd.pacer_doc_id

        rd.date_upload = self.get_datetime_from_node(doc_node, 'upload_date')
        rd.document_type = document_type or rd.document_type

        # If we can't parse the availability node (it returns None), default it
        # to False.
        availability = self.get_bool_from_node(doc_node, 'available')
        rd.is_available = False if availability is None else availability
        rd.sha1 = self.get_str_from_node(doc_node, 'sha1')
        rd.description = (self.get_str_from_node(doc_node, 'short_desc')
                          or rd.description)
        if rd.is_available:
            rd.filepath_ia = get_ia_document_url_from_path(
                self.path, entry_number, attachment_number)
            rd.filepath_local = os.path.join(
                'recap',
                get_local_document_url_from_path(self.path, entry_number,
                                                 attachment_number),
            )
            if rd.page_count is None:
                extension = rd.filepath_local.path.split('.')[-1]
                rd.page_count = get_page_count(rd.filepath_local.path,
                                               extension)
        if document_type == RECAPDocument.ATTACHMENT:
            rd.attachment_number = attachment_number
        if not debug:
            try:
                rd.save(do_extraction=False, index=False)
            except IntegrityError as e:
                # This happens when a pacer_doc_id has been wrongly set as
                # the document_number, see for example, document 19 and
                # document 00405193374 here: https://ia802300.us.archive.org/23/items/gov.uscourts.ca4.14-1872/gov.uscourts.ca4.14-1872.docket.xml
                logger.error(
                    "Unable to create RECAPDocument for document #%s, "
                    "attachment #%s on entry: %s due to "
                    "IntegrityError." %
                    (rd.document_number, rd.attachment_number,
                     rd.docket_entry))
                return None
        return rd
Пример #8
0
def process_recap_pdf(self, pk):
    """Save a RECAP PDF to the database."""
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item (debug is: %s): %s " % (pq.debug, pq))
    try:
        rd = RECAPDocument.objects.get(
            docket_entry__docket__pacer_case_id=pq.pacer_case_id,
            pacer_doc_id=pq.pacer_doc_id,
        )
    except RECAPDocument.DoesNotExist:
        try:
            d = Docket.objects.get(pacer_case_id=pq.pacer_case_id,
                                   court_id=pq.court_id)
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully the docket
            # will be in place soon (it could be in a different upload task that
            # hasn't yet been processed).
            logger.warning("Unable to find docket for processing queue '%s'. "
                           "Retrying if max_retries is not exceeded." % pq)
            pq.error_message = "Unable to find docket for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                pq.status = pq.PROCESSING_FAILED
                pq.save()
                return None
            else:
                pq.status = pq.QUEUED_FOR_RETRY
                pq.save()
                raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            logger.error(msg)
            pq.error_message = msg
            pq.status = pq.PROCESSING_FAILED
            pq.save()
            return None
        else:
            # Got the Docket, attempt to get/create the DocketEntry, and then
            # create the RECAPDocument
            try:
                de = DocketEntry.objects.get(docket=d,
                                             entry_number=pq.document_number)
            except DocketEntry.DoesNotExist as exc:
                logger.warning("Unable to find docket entry for processing "
                               "queue '%s'. Retrying if max_retries is not "
                               "exceeded." % pq)
                pq.error_message = "Unable to find docket entry for item."
                if (self.request.retries == self.max_retries) or pq.debug:
                    pq.status = pq.PROCESSING_FAILED
                    pq.save()
                    return None
                else:
                    pq.status = pq.QUEUED_FOR_RETRY
                    pq.save()
                    raise self.retry(exc=exc)

        # All objects accounted for. Make some data.
        rd = RECAPDocument(
            docket_entry=de,
            pacer_doc_id=pq.pacer_doc_id,
            date_upload=timezone.now(),
        )
        if pq.attachment_number is None:
            rd.document_type = RECAPDocument.PACER_DOCUMENT
        else:
            rd.document_type = RECAPDocument.ATTACHMENT

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    content = pq.filepath_local.read()
    new_sha1 = hashlib.sha1(content).hexdigest()
    existing_document = all([
        rd.sha1 == new_sha1,
        rd.is_available,
        rd.filepath_local and os.path.isfile(rd.filepath_local.path)
    ])
    if not existing_document:
        # Different sha1, it wasn't available, or it's missing from disk. Move
        # the new file over from the processing queue storage.
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        rd.filepath_local.save(file_name, cf, save=False)
        rd.is_available = True
        rd.sha1 = new_sha1

        # Do page count and extraction
        extension = rd.filepath_local.path.split('.')[-1]
        rd.page_count = get_page_count(rd.filepath_local.path, extension)
        rd.ocr_status = None

    if not pq.debug:
        rd.save()

    if not existing_document and not pq.debug:
        extract_recap_pdf(rd.pk)
        add_or_update_recap_document([rd.pk], force_commit=False)

    mark_pq_successful(pq, d_id=rd.docket_entry.docket_id,
                       de_id=rd.docket_entry_id, rd_id=rd.pk)
    return rd
Пример #9
0
    def make_recap_document(self, doc_node, docket_entry, entry_number,
                            attachment_number, document_type, debug):
        """Make a PACER document."""
        pacer_document_id = self.get_str_from_node(
            doc_node, 'pacer_doc_id')
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=docket_entry,
                document_number=entry_number,
                # Use the attachment number if it is not 0, else use None.
                attachment_number=attachment_number or None,
            )
        except RECAPDocument.DoesNotExist:
            rd = RECAPDocument(
                docket_entry=docket_entry,
                pacer_doc_id=pacer_document_id,
                document_number=entry_number,
            )
        else:
            rd.pacer_doc_id = pacer_document_id or rd.pacer_doc_id

        rd.date_upload = self.get_datetime_from_node(doc_node, 'upload_date')
        rd.document_type = document_type or rd.document_type

        if not rd.is_available:
            # If we can't parse the availability node (it returns None),
            # default it to False.
            availability = self.get_bool_from_node(doc_node, 'available')
            rd.is_available = False if availability is None else availability
        if not rd.sha1:
            rd.sha1 = self.get_str_from_node(doc_node, 'sha1')
        rd.description = (self.get_str_from_node(doc_node, 'short_desc') or
                          rd.description)
        if rd.is_available:
            rd.filepath_ia = get_ia_document_url_from_path(
                self.path, entry_number, attachment_number)
            rd.filepath_local = os.path.join(
                'recap',
                get_local_document_url_from_path(self.path, entry_number,
                                                 attachment_number),
            )
            if rd.page_count is None:
                extension = rd.filepath_local.path.split('.')[-1]
                rd.page_count = get_page_count(rd.filepath_local.path, extension)
        if document_type == RECAPDocument.ATTACHMENT:
            rd.attachment_number = attachment_number
        if not debug:
            rd.save(do_extraction=False, index=False)
        return rd
Пример #10
0
    def make_recap_document(self, doc_node, docket_entry, entry_number,
                            attachment_number, document_type, debug):
        """Make a PACER document."""
        pacer_document_id = self.get_str_from_node(
            doc_node, 'pacer_doc_id')
        try:
            recap_doc = RECAPDocument.objects.get(
                pacer_doc_id=pacer_document_id
            )
        except RECAPDocument.DoesNotExist:
            recap_doc = RECAPDocument(
                pacer_doc_id=pacer_document_id,
                docket_entry=docket_entry
            )

        recap_doc.date_upload = self.get_datetime_from_node(doc_node, 'upload_date')
        recap_doc.document_type = document_type or recap_doc.document_type
        recap_doc.document_number = entry_number or recap_doc.document_number
        # If we can't parse the availability node (it returns None), default it
        # to False.
        availability = self.get_bool_from_node(doc_node, 'available')
        recap_doc.is_available = False if availability is None else availability
        recap_doc.sha1 = self.get_str_from_node(doc_node, 'sha1')
        recap_doc.description = (
            self.get_str_from_node(doc_node, 'short_desc') or
            recap_doc.description
        )
        if recap_doc.is_available:
            recap_doc.filepath_ia = get_ia_document_url_from_path(
                self.path, entry_number, attachment_number)
            recap_doc.filepath_local = os.path.join(
                'recap',
                get_local_document_url_from_path(self.path, entry_number,
                                                 attachment_number),
            )
        if document_type == RECAPDocument.ATTACHMENT:
            recap_doc.attachment_number = attachment_number
        if not debug:
            recap_doc.save()
        return recap_doc
Пример #11
0
def process_recap_pdf(self, pk):
    """Save a RECAP PDF to the database."""
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item: %s" % pq)
    try:
        rd = RECAPDocument.objects.get(
            docket_entry__docket__pacer_case_id=pq.pacer_case_id,
            pacer_doc_id=pq.pacer_doc_id,
        )
    except RECAPDocument.DoesNotExist:
        try:
            d = Docket.objects.get(pacer_case_id=pq.pacer_case_id,
                                   court_id=pq.court_id)
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully the docket
            # will be in place soon (it could be in a different upload task that
            # hasn't yet been processed).
            logger.warning("Unable to find docket for processing queue '%s'. "
                           "Retrying if max_retries is not exceeded." % pq)
            pq.error_message = "Unable to find docket for item."
            if self.request.retries == self.max_retries:
                pq.status = pq.PROCESSING_FAILED
            else:
                pq.status = pq.QUEUED_FOR_RETRY
            pq.save()
            raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            logger.error(msg)
            pq.error_message = msg
            pq.status = pq.PROCESSING_FAILED
            pq.save()
            return None
        else:
            # Got the Docket, attempt to get/create the DocketEntry, and then
            # create the RECAPDocument
            try:
                de = DocketEntry.objects.get(docket=d,
                                             entry_number=pq.document_number)
            except DocketEntry.DoesNotExist as exc:
                logger.warning("Unable to find docket entry for processing "
                               "queue '%s'. Retrying if max_retries is not "
                               "exceeded." % pq)
                pq.error_message = "Unable to find docket entry for item."
                if self.request.retries == self.max_retries:
                    pq.status = pq.PROCESSING_FAILED
                else:
                    pq.status = pq.QUEUED_FOR_RETRY
                pq.save()
                raise self.retry(exc=exc)

        # All objects accounted for. Make some data.
        rd = RECAPDocument(
            docket_entry=de,
            pacer_doc_id=pq.pacer_doc_id,
            date_upload=timezone.now(),
        )
        if pq.attachment_number is None:
            rd.document_type = RECAPDocument.PACER_DOCUMENT
        else:
            rd.document_type = RECAPDocument.ATTACHMENT

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    content = pq.filepath_local.read()
    new_sha1 = hashlib.sha1(content).hexdigest()
    if all([rd.sha1 == new_sha1,
            rd.is_available,
            rd.filepath_local and os.path.isfile(rd.filepath_local.path)]):
        # All good. Press on.
        new_document = False
    else:
        # Different sha1, it wasn't available, or it's missing from disk. Move
        # the new file over from the processing queue storage.
        new_document = True
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        rd.filepath_local.save(file_name, cf, save=False)
        rd.is_available = True
        rd.sha1 = new_sha1

        # Do page count and extraction
        extension = rd.filepath_local.path.split('.')[-1]
        rd.page_count = get_page_count(rd.filepath_local.path, extension)
        rd.ocr_status = None

    # Ditch the original file
    pq.filepath_local.delete(save=False)
    pq.error_message = ''  # Clear out errors b/c successful
    pq.status = pq.PROCESSING_SUCCESSFUL
    pq.save()

    rd.save()
    if new_document:
        extract_recap_pdf(rd.pk)
        add_or_update_recap_document([rd.pk], force_commit=False)

    return rd
Пример #12
0
    def make_recap_document(self, doc_node, docket_entry, entry_number,
                            attachment_number, document_type, debug):
        """Make a PACER document."""
        pacer_document_id = self.get_str_from_node(doc_node, 'pacer_doc_id')
        try:
            recap_doc = RECAPDocument.objects.get(
                pacer_doc_id=pacer_document_id)
        except RECAPDocument.DoesNotExist:
            recap_doc = RECAPDocument(pacer_doc_id=pacer_document_id,
                                      docket_entry=docket_entry)

        recap_doc.date_upload = self.get_datetime_from_node(
            doc_node, 'upload_date')
        recap_doc.document_type = document_type or recap_doc.document_type
        recap_doc.document_number = entry_number or recap_doc.document_number
        # If we can't parse the availability node (it returns None), default it
        # to False.
        availability = self.get_bool_from_node(doc_node, 'available')
        recap_doc.is_available = False if availability is None else availability
        recap_doc.sha1 = self.get_str_from_node(doc_node, 'sha1')
        recap_doc.description = (self.get_str_from_node(
            doc_node, 'short_desc') or recap_doc.description)
        if recap_doc.is_available:
            recap_doc.filepath_ia = get_ia_document_url_from_path(
                self.path, entry_number, attachment_number)
            recap_doc.filepath_local = os.path.join(
                'recap',
                get_local_document_url_from_path(self.path, entry_number,
                                                 attachment_number),
            )
        if document_type == RECAPDocument.ATTACHMENT:
            recap_doc.attachment_number = attachment_number
        if not debug:
            recap_doc.save()
        return recap_doc