Example #1
    def make_recap_document(self, doc_node, docket_entry, entry_number,
                            attachment_number, document_type, debug):
        """Make a PACER document."""
        pacer_document_id = self.get_str_from_node(doc_node, 'pacer_doc_id')
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=docket_entry,
                document_number=entry_number,
                # Use the attachment number if it is not 0, else use None.
                attachment_number=attachment_number or None,
            )
        except RECAPDocument.DoesNotExist:
            rd = RECAPDocument(
                docket_entry=docket_entry,
                pacer_doc_id=pacer_document_id,
                document_number=entry_number,
            )
        else:
            rd.pacer_doc_id = pacer_document_id or rd.pacer_doc_id

        rd.date_upload = self.get_datetime_from_node(doc_node, 'upload_date')
        rd.document_type = document_type or rd.document_type

        # If we can't parse the availability node (it returns None), default it
        # to False.
        availability = self.get_bool_from_node(doc_node, 'available')
        rd.is_available = False if availability is None else availability
        rd.sha1 = self.get_str_from_node(doc_node, 'sha1')
        rd.description = (self.get_str_from_node(doc_node, 'short_desc')
                          or rd.description)
        if rd.is_available:
            rd.filepath_ia = get_ia_document_url_from_path(
                self.path, entry_number, attachment_number)
            rd.filepath_local = os.path.join(
                'recap',
                get_local_document_url_from_path(self.path, entry_number,
                                                 attachment_number),
            )
            if rd.page_count is None:
                extension = rd.filepath_local.path.split('.')[-1]
                rd.page_count = get_page_count(rd.filepath_local.path,
                                               extension)
        if document_type == RECAPDocument.ATTACHMENT:
            rd.attachment_number = attachment_number
        if not debug:
            try:
                rd.save(do_extraction=False, index=False)
            except IntegrityError as e:
                # This happens when a pacer_doc_id has been wrongly set as
                # the document_number, see for example, document 19 and
                # document 00405193374 here: https://ia802300.us.archive.org/23/items/gov.uscourts.ca4.14-1872/gov.uscourts.ca4.14-1872.docket.xml
                logger.error(
                    "Unable to create RECAPDocument for document #%s, "
                    "attachment #%s on entry: %s due to "
                    "IntegrityError." %
                    (rd.document_number, rd.attachment_number,
                     rd.docket_entry))
                return None
        return rd
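
A note on the lookup above: "attachment_number or None" maps an attachment
number of 0 to None, so main documents are stored with a NULL attachment
number, as the inline comment says. A stand-alone illustration of the idiom
(not from the source):

# 0 and None are both falsy, so both collapse to None; real attachment
# numbers pass through unchanged.
for att_num in (0, None, 3):
    print(att_num or None)  # prints: None, None, 3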
Example #2
def get_and_process_pdf(self, data, session, row_pk, index=False):
    if data is None:
        return
    result = data['result']
    rd = RECAPDocument.objects.get(pk=data['rd_pk'])
    report = FreeOpinionReport(data['pacer_court_id'], session)
    try:
        r = report.download_pdf(result.pacer_case_id, result.pacer_doc_id)
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" % result)
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [
                HTTP_500_INTERNAL_SERVER_ERROR, HTTP_504_GATEWAY_TIMEOUT
        ]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            PACERFreeDocumentRow.objects.filter(pk=row_pk).update(
                error_msg=msg)
            self.request.callbacks = None
            return

    if r is None:
        msg = "Unable to get PDF for %s at %s with doc id %s" % \
              (result, result.court_id, result.pacer_doc_id)
        logger.error(msg)
        PACERFreeDocumentRow.objects.filter(pk=row_pk).update(error_msg=msg)
        self.request.callbacks = None
        return

    file_name = get_document_filename(
        result.court.pk,
        result.pacer_case_id,
        result.document_number,
        0,  # Attachment number is zero for all free opinions.
    )
    cf = ContentFile(r.content)
    rd.filepath_local.save(file_name, cf, save=False)
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, so
    # force it all to be bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(r.content)).hexdigest()
    rd.is_free_on_pacer = True
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save and extract, skipping OCR.
    rd.save(do_extraction=False, index=index)
    extract_recap_pdf(rd.pk, skip_ocr=True, check_if_needed=False)
    return {'result': result, 'rd_pk': rd.pk}
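
get_and_process_pdf is a bound Celery task: self.retry() re-queues it on
transient network errors, and clearing self.request.callbacks aborts the rest
of the task chain on hard failures. A minimal sketch of the retry idiom,
assuming a Celery app; the broker URL and task body here are made up:

import requests
from celery import Celery

app = Celery("sketch", broker="memory://")

@app.task(bind=True, max_retries=5)
def fetch_pdf(self, url):
    try:
        return requests.get(url, timeout=30).content
    except requests.ConnectionError as exc:
        # Raising self.retry() re-queues this task until max_retries is hit.
        raise self.retry(exc=exc)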
Example #4
    def make_recap_document(self, doc_node, docket_entry, entry_number,
                            attachment_number, document_type, debug):
        """Make a PACER document."""
        pacer_document_id = self.get_str_from_node(
            doc_node, 'pacer_doc_id')
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=docket_entry,
                document_number=entry_number,
                # Use the attachment number if it is not 0, else use None.
                attachment_number=attachment_number or None,
            )
        except RECAPDocument.DoesNotExist:
            rd = RECAPDocument(
                docket_entry=docket_entry,
                pacer_doc_id=pacer_document_id,
                document_number=entry_number,
            )
        else:
            rd.pacer_doc_id = pacer_document_id or rd.pacer_doc_id

        rd.date_upload = self.get_datetime_from_node(doc_node, 'upload_date')
        rd.document_type = document_type or rd.document_type

        if not rd.is_available:
            # If we can't parse the availability node (it returns None),
            # default it to False.
            availability = self.get_bool_from_node(doc_node, 'available')
            rd.is_available = False if availability is None else availability
        if not rd.sha1:
            rd.sha1 = self.get_str_from_node(doc_node, 'sha1')
        rd.description = (self.get_str_from_node(doc_node, 'short_desc') or
                          rd.description)
        if rd.is_available:
            rd.filepath_ia = get_ia_document_url_from_path(
                self.path, entry_number, attachment_number)
            rd.filepath_local = os.path.join(
                'recap',
                get_local_document_url_from_path(self.path, entry_number,
                                                 attachment_number),
            )
            if rd.page_count is None:
                extension = rd.filepath_local.path.split('.')[-1]
                rd.page_count = get_page_count(rd.filepath_local.path, extension)
        if document_type == RECAPDocument.ATTACHMENT:
            rd.attachment_number = attachment_number
        if not debug:
            rd.save(do_extraction=False, index=False)
        return rd
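
Both make_recap_document variants take the file extension with
split('.')[-1] on the stored path. A self-contained comparison with the
standard-library alternative, os.path.splitext (illustration only, not from
the source):

import os

path = "recap/gov.uscourts.ca4.14-1872.docket.pdf"
print(path.split(".")[-1])        # 'pdf'
print(os.path.splitext(path)[1])  # '.pdf' (keeps the leading dot)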
Example #5
def update_rd_metadata(self, rd_pk, response, court_id, pacer_case_id,
                       pacer_doc_id, document_number, attachment_number):
    """After querying PACER and downloading a document, save it to the DB.

    :param rd_pk: The primary key of the RECAPDocument to work on
    :param response: A requests.Response object containing the PDF data.
    :param court_id: A CourtListener court ID to use for file names.
    :param pacer_case_id: The pacer_case_id to use in error logs.
    :param pacer_doc_id: The pacer_doc_id to use in error logs.
    :param document_number: The docket entry number for use in file names.
    :param attachment_number: The attachment number (if applicable) for use in
    file names.
    :return: A two-tuple of a boolean indicating success and a corresponding
    error/success message string.
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    if response is None:
        msg = "Unable to get PDF for RECAP Document '%s' " \
              "at '%s' with doc id '%s'" % (rd_pk, court_id, pacer_doc_id)
        logger.error(msg)
        self.request.callbacks = None
        return False, msg

    file_name = get_document_filename(court_id, pacer_case_id, document_number,
                                      attachment_number)
    cf = ContentFile(response.content)
    rd.filepath_local.save(file_name, cf, save=False)
    rd.file_size = rd.filepath_local.size
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, so
    # force it all to be bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(response.content)).hexdigest()
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save and extract, skipping OCR.
    rd.save()

    # Make sure we mark the docket as needing upload
    changed = mark_ia_upload_needed(rd.docket_entry.docket)
    if changed:
        rd.docket_entry.docket.save()

    return True, 'Saved item successfully'
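
update_rd_metadata hashes the raw response body, and force_bytes guards
against the content arriving as str rather than bytes. A stand-alone sketch
of the same step without the Django helper:

import hashlib

content = "fake PDF body"  # response.content may arrive as str or bytes
data = content.encode() if isinstance(content, str) else content
print(hashlib.sha1(data).hexdigest())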
Example #6
    def handle(self, *args, **options):
        raw_input(
            "This is a very primitive script that has serious performance "
            "issues with large datasets. Press any key to proceed anyway. "
            "Otherwise, press CTRL+C to exit.")
        cnt = Counter()
        for r in RECAPDocument.objects.all():
            try:
                path = r.filepath_local.path
            except ValueError:
                cnt['no_file'] += 1
            else:
                extension = path.split('.')[-1]
                count = get_page_count(path, extension)
                r.page_count = count
                r.save(do_extraction=False, index=False)
                cnt['successes'] += 1
                if count is not None:
                    cnt['total_pages'] += count

        print cnt
Example #7
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)
        input("This is a very primitive script that has serious performance "
              "issues with large datasets. Press any key to proceed anyway. "
              "Otherwise, press CTRL+C to exit.")
        cnt = Counter()
        for rd in RECAPDocument.objects.all():
            try:
                path = rd.filepath_local.path
            except ValueError:
                cnt["no_file"] += 1
            else:
                extension = path.split(".")[-1]
                count = get_page_count(path, extension)
                rd.page_count = count
                rd.save(do_extraction=False, index=False)
                cnt["successes"] += 1
                if count is not None:
                    cnt["total_pages"] += count

        logger.info(cnt)
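
Both versions of this management command (the Python 2 one in Example #6 and
the Python 3 rewrite above) tally results with collections.Counter. A
stand-alone illustration of that bookkeeping, minus the no_file branch:

from collections import Counter

cnt = Counter()
for count in (10, None, 3):  # stand-ins for get_page_count() results
    cnt["successes"] += 1
    if count is not None:
        cnt["total_pages"] += count
print(cnt)  # Counter({'total_pages': 13, 'successes': 3})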
Example #9
    def make_recap_document(self, doc_node, docket_entry, entry_number,
                            attachment_number, document_type, debug):
        """Do nothing for items that don't start with zero. For ones that do,
        find the stripped version, fix it, download the correct item, extract
        it and finally save it to Solr.
        """

        if not entry_number.startswith('0'):
            # Only touch things where the new value leads with a zero.
            return None
        else:
            logger.info("  Doing docket_entry: %s, document_number, "
                        "%s and attachment number: %s" %
                        (docket_entry, entry_number, attachment_number))
        old_entry_number = int(entry_number)

        try:
            rd = RECAPDocument.objects.get(
                docket_entry=docket_entry,
                document_number=old_entry_number,
                attachment_number=attachment_number or None,
            )
            logger.info("    Found item.")
        except RECAPDocument.DoesNotExist:
            logger.info("    Failed to find item.")
            return None

        rd.document_number = entry_number
        if rd.is_available:
            new_ia = get_ia_document_url_from_path(self.path, entry_number,
                                                   attachment_number)
            logger.info("    Updating IA URL from %s to %s" %
                        (rd.filepath_ia, new_ia))
            rd.filepath_ia = new_ia

            if not os.path.isfile(rd.filepath_local.path):
                # Set the value correctly and get the file from IA if we don't
                # already have it.
                new_local_path = os.path.join(
                    'recap',
                    get_local_document_url_from_path(self.path, entry_number,
                                                     attachment_number),
                )
                logger.info("    Updating local path from %s to %s" %
                            (rd.filepath_local, new_local_path))
                rd.filepath_local = new_local_path
                filename = rd.filepath_ia.rsplit('/', 1)[-1]
                logger.info("    Downloading item with filename %s" % filename)
                if not debug:
                    download_recap_item(rd.filepath_ia, filename)
            else:
                logger.info("    File already on disk. Punting.")

            if rd.page_count is None:
                logger.info("    Getting page count.")
                extension = rd.filepath_local.path.split('.')[-1]
                rd.page_count = get_page_count(rd.filepath_local.path,
                                               extension)
        else:
            logger.info("    Item not available in RECAP. Punting.")
            return None

        if not debug:
            try:
                extract_recap_pdf(rd.pk, check_if_needed=False)
                rd.save(do_extraction=False, index=True)
                logger.info(
                    "    Item saved at https://www.courtlistener.com%s" %
                    rd.get_absolute_url())
            except IntegrityError:
                logger.info("    Integrity error while saving.")
                return None
        else:
            logger.info("    No save requested in debug mode.")

        return rd
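
Note the int() cast above: the broken entries were stored under the stripped
number, so the lookup queries with the integer while the zero-padded string
is written back as the corrected document_number. A two-line illustration:

entry_number = "019"
print(entry_number.startswith("0"), int(entry_number))  # True 19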
Example #10
def process_recap_pdf(self, pk):
    """Process an uploaded PDF from the RECAP API endpoint.

    :param pk: The PK of the processing queue item you want to work on.
    :return: A RECAPDocument object that was created or updated.
    """
    """Save a RECAP PDF to the database."""
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)

    if pq.attachment_number is None:
        document_type = RECAPDocument.PACER_DOCUMENT
    else:
        document_type = RECAPDocument.ATTACHMENT

    logger.info("Processing RECAP item (debug is: %s): %s " % (pq.debug, pq))
    try:
        if pq.pacer_case_id:
            rd = RECAPDocument.objects.get(
                docket_entry__docket__pacer_case_id=pq.pacer_case_id,
                pacer_doc_id=pq.pacer_doc_id,
            )
        else:
            # Sometimes we don't have the case ID from PACER. Try to make this
            # work anyway.
            rd = RECAPDocument.objects.get(pacer_doc_id=pq.pacer_doc_id)
    except (RECAPDocument.DoesNotExist, RECAPDocument.MultipleObjectsReturned):
        try:
            d = Docket.objects.get(
                pacer_case_id=pq.pacer_case_id, court_id=pq.court_id
            )
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully
            # the docket will be in place soon (it could be in a
            # different upload task that hasn't yet been processed).
            logger.warning(
                "Unable to find docket for processing queue '%s'. "
                "Retrying if max_retries is not exceeded." % pq
            )
            error_message = "Unable to find docket for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, error_message, PROCESSING_STATUS.FAILED)
                return None
            else:
                mark_pq_status(
                    pq, error_message, PROCESSING_STATUS.QUEUED_FOR_RETRY
                )
                raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None

        # Got the Docket, attempt to get/create the DocketEntry, and then
        # create the RECAPDocument
        try:
            de = DocketEntry.objects.get(
                docket=d, entry_number=pq.document_number
            )
        except DocketEntry.DoesNotExist as exc:
            logger.warning(
                "Unable to find docket entry for processing "
                "queue '%s'." % pq
            )
            msg = "Unable to find docket entry for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
                return None
            else:
                mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
                raise self.retry(exc=exc)
        else:
            # If we're here, we've got the docket and docket
            # entry, but were unable to find the document by
            # pacer_doc_id. This happens when pacer_doc_id is
            # missing, for example. ∴, try to get the document
            # from the docket entry.
            try:
                rd = RECAPDocument.objects.get(
                    docket_entry=de,
                    document_number=pq.document_number,
                    attachment_number=pq.attachment_number,
                    document_type=document_type,
                )
            except (
                RECAPDocument.DoesNotExist,
                RECAPDocument.MultipleObjectsReturned,
            ):
                # Unable to find it. Make a new item.
                rd = RECAPDocument(
                    docket_entry=de,
                    pacer_doc_id=pq.pacer_doc_id,
                    document_type=document_type,
                )

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    try:
        content = pq.filepath_local.read()
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    new_sha1 = sha1(content)
    existing_document = all(
        [
            rd.sha1 == new_sha1,
            rd.is_available,
            rd.filepath_local and os.path.isfile(rd.filepath_local.path),
        ]
    )
    if not existing_document:
        # Different sha1, it wasn't available, or it's missing from disk. Move
        # the new file over from the processing queue storage.
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        if not pq.debug:
            rd.filepath_local.save(file_name, cf, save=False)

            # Do page count and extraction
            extension = rd.filepath_local.path.split(".")[-1]
            rd.page_count = get_page_count(rd.filepath_local.path, extension)
            rd.file_size = rd.filepath_local.size

        rd.ocr_status = None
        rd.is_available = True
        rd.sha1 = new_sha1
        rd.date_upload = now()

    if not pq.debug:
        try:
            rd.save()
        except (IntegrityError, ValidationError):
            msg = "Duplicate key on unique_together constraint"
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            rd.filepath_local.delete(save=False)
            return None

    if not existing_document and not pq.debug:
        extract_recap_pdf(rd.pk)
        add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_pq_successful(
        pq,
        d_id=rd.docket_entry.docket_id,
        de_id=rd.docket_entry_id,
        rd_id=rd.pk,
    )
    mark_ia_upload_needed(rd.docket_entry.docket, save_docket=True)
    return rd
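
The existing_document test above treats an upload as already ingested only
when the stored hash, the availability flag, and the on-disk file all agree.
A self-contained sketch of that predicate (the function name is made up):

import os

def is_existing_document(stored_sha1, new_sha1, is_available, path):
    # Mirrors the all([...]) test above, minus the Django FileField.
    return all([
        stored_sha1 == new_sha1,
        is_available,
        path and os.path.isfile(path),
    ])

print(is_existing_document("abc", "abc", True, __file__))  # True
print(is_existing_document("abc", "def", True, __file__))  # False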
Example #11
def get_pacer_doc_by_rd_and_description(self,
                                        rd_pk,
                                        description_re,
                                        session,
                                        fallback_to_main_doc=False,
                                        tag=None):
    """Using a RECAPDocument object ID and a description of a document, get the
    document from PACER.

    This function was originally meant to get civil cover sheets, but can be
    repurposed as needed.

    :param rd_pk: The PK of a RECAPDocument object to use as a source.
    :param description_re: A compiled regular expression to search against the
    description provided by the attachment page.
    :param session: The PACER session object to use.
    :param fallback_to_main_doc: Should we grab the main doc if none of the
    attachments match the regex?
    :param tag: A tag name to apply to any downloaded content.
    :return: None
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    if not rd.pacer_doc_id:
        # Some docket entries are just text/don't have a pacer_doc_id.
        self.request.callbacks = None
        return

    d = rd.docket_entry.docket
    pacer_court_id = map_cl_to_pacer_id(d.court_id)
    att_report = AttachmentPage(pacer_court_id, session)
    try:
        att_report.query(rd.pacer_doc_id)
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" % rd)
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [
                HTTP_500_INTERNAL_SERVER_ERROR, HTTP_504_GATEWAY_TIMEOUT
        ]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            self.request.callbacks = None
            return

    att_found = None
    for attachment in att_report.data.get('attachments', []):
        if description_re.search(attachment['description']):
            att_found = attachment.copy()
            document_type = RECAPDocument.ATTACHMENT
            break

    if not att_found:
        if fallback_to_main_doc:
            logger.info("Falling back to main document for pacer_doc_id: %s" %
                        rd.pacer_doc_id)
            att_found = att_report.data
            document_type = RECAPDocument.PACER_DOCUMENT
        else:
            msg = "Aborting. Did not find civil cover sheet for %s." % rd
            logger.error(msg)
            self.request.callbacks = None
            return

    if not att_found.get('pacer_doc_id'):
        logger.warning("No pacer_doc_id for document (is it sealed?)")
        self.request.callbacks = None
        return

    # Try to find the attachment already in the collection
    rd, _ = RECAPDocument.objects.get_or_create(
        docket_entry=rd.docket_entry,
        attachment_number=att_found.get('attachment_number'),
        document_number=rd.document_number,
        pacer_doc_id=att_found['pacer_doc_id'],
        document_type=document_type,
        defaults={
            'date_upload': now(),
        },
    )
    # Replace the description if we have description data. Else fallback on old.
    rd.description = att_found.get('description', '') or rd.description
    if tag is not None:
        tag, _ = Tag.objects.get_or_create(name=tag)
        rd.tags.add(tag)

    if rd.is_available:
        # Great. Call it a day.
        rd.save(do_extraction=False, index=False)
        return

    # Not available. Go get it.
    try:
        pacer_case_id = rd.docket_entry.docket.pacer_case_id
        r = att_report.download_pdf(pacer_case_id, att_found['pacer_doc_id'])
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" % att_found['pacer_doc_id'])
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [
                HTTP_500_INTERNAL_SERVER_ERROR, HTTP_504_GATEWAY_TIMEOUT
        ]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            self.request.callbacks = None
            return

    if r is None:
        msg = "Unable to get PDF for %s at PACER court '%s' with doc id %s" % \
              (rd, pacer_court_id, rd.pacer_doc_id)
        logger.error(msg)
        self.request.callbacks = None
        return

    file_name = get_document_filename(
        d.court_id,
        pacer_case_id,
        rd.document_number,
        rd.attachment_number,
    )
    cf = ContentFile(r.content)
    rd.filepath_local.save(file_name, cf, save=False)
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, force it all to be
    # bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(r.content)).hexdigest()
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save, extract, then save to Solr. Skip OCR for now. Don't do these async.
    rd.save(do_extraction=False, index=False)
    extract_recap_pdf(rd.pk, skip_ocr=True)
    add_or_update_recap_document([rd.pk])
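
The attachment matching above expects description_re to arrive pre-compiled;
per the docstring, civil cover sheets were the original target. A minimal
stand-alone illustration with made-up attachment data:

import re

description_re = re.compile(r"civil cover sheet", re.IGNORECASE)
attachments = [
    {"description": "Exhibit A"},
    {"description": "Civil Cover Sheet"},
]
att_found = next(
    (a for a in attachments if description_re.search(a["description"])),
    None,
)
print(att_found)  # {'description': 'Civil Cover Sheet'}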
Example #12
def process_recap_pdf(self, pk):
    """Save a RECAP PDF to the database."""
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item (debug is: %s): %s " % (pq.debug, pq))
    try:
        rd = RECAPDocument.objects.get(
            docket_entry__docket__pacer_case_id=pq.pacer_case_id,
            pacer_doc_id=pq.pacer_doc_id,
        )
    except RECAPDocument.DoesNotExist:
        try:
            d = Docket.objects.get(pacer_case_id=pq.pacer_case_id,
                                   court_id=pq.court_id)
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully the docket
            # will be in place soon (it could be in a different upload task that
            # hasn't yet been processed).
            logger.warning("Unable to find docket for processing queue '%s'. "
                           "Retrying if max_retries is not exceeded." % pq)
            pq.error_message = "Unable to find docket for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                pq.status = pq.PROCESSING_FAILED
                pq.save()
                return None
            else:
                pq.status = pq.QUEUED_FOR_RETRY
                pq.save()
                raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            logger.error(msg)
            pq.error_message = msg
            pq.status = pq.PROCESSING_FAILED
            pq.save()
            return None
        else:
            # Got the Docket, attempt to get/create the DocketEntry, and then
            # create the RECAPDocument
            try:
                de = DocketEntry.objects.get(docket=d,
                                             entry_number=pq.document_number)
            except DocketEntry.DoesNotExist as exc:
                logger.warning("Unable to find docket entry for processing "
                               "queue '%s'. Retrying if max_retries is not "
                               "exceeded." % pq)
                pq.error_message = "Unable to find docket entry for item."
                if (self.request.retries == self.max_retries) or pq.debug:
                    pq.status = pq.PROCESSING_FAILED
                    pq.save()
                    return None
                else:
                    pq.status = pq.QUEUED_FOR_RETRY
                    pq.save()
                    raise self.retry(exc=exc)

        # All objects accounted for. Make some data.
        rd = RECAPDocument(
            docket_entry=de,
            pacer_doc_id=pq.pacer_doc_id,
            date_upload=timezone.now(),
        )
        if pq.attachment_number is None:
            rd.document_type = RECAPDocument.PACER_DOCUMENT
        else:
            rd.document_type = RECAPDocument.ATTACHMENT

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    content = pq.filepath_local.read()
    new_sha1 = hashlib.sha1(content).hexdigest()
    existing_document = all([
        rd.sha1 == new_sha1,
        rd.is_available,
        rd.filepath_local and os.path.isfile(rd.filepath_local.path)
    ])
    if not existing_document:
        # Different sha1, it wasn't available, or it's missing from disk. Move
        # the new file over from the processing queue storage.
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        rd.filepath_local.save(file_name, cf, save=False)
        rd.is_available = True
        rd.sha1 = new_sha1

        # Do page count and extraction
        extension = rd.filepath_local.path.split('.')[-1]
        rd.page_count = get_page_count(rd.filepath_local.path, extension)
        rd.ocr_status = None

    if not pq.debug:
        rd.save()

    if not existing_document and not pq.debug:
        extract_recap_pdf(rd.pk)
        add_or_update_recap_document([rd.pk], force_commit=False)

    mark_pq_successful(pq, d_id=rd.docket_entry.docket_id,
                       de_id=rd.docket_entry_id, rd_id=rd.pk)
    return rd
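
Unlike the "attachment_number or None" idiom in Example #1, the
document_type decision here tests "attachment_number is None", so an
attachment numbered 0 would still be classified as an attachment. A quick
illustration with stand-in constants:

PACER_DOCUMENT, ATTACHMENT = 1, 2  # stand-ins for the RECAPDocument constants

def doc_type(attachment_number):
    return PACER_DOCUMENT if attachment_number is None else ATTACHMENT

print(doc_type(None), doc_type(0), doc_type(3))  # 1 2 2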
Example #13
def process_recap_pdf(self, pk):
    """Save a RECAP PDF to the database."""
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item: %s" % pq)
    try:
        rd = RECAPDocument.objects.get(
            docket_entry__docket__pacer_case_id=pq.pacer_case_id,
            pacer_doc_id=pq.pacer_doc_id,
        )
    except RECAPDocument.DoesNotExist:
        try:
            d = Docket.objects.get(pacer_case_id=pq.pacer_case_id,
                                   court_id=pq.court_id)
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully the docket
            # will be in place soon (it could be in a different upload task that
            # hasn't yet been processed).
            logger.warning("Unable to find docket for processing queue '%s'. "
                           "Retrying if max_retries is not exceeded." % pq)
            pq.error_message = "Unable to find docket for item."
            if self.request.retries == self.max_retries:
                pq.status = pq.PROCESSING_FAILED
            else:
                pq.status = pq.QUEUED_FOR_RETRY
            pq.save()
            raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            logger.error(msg)
            pq.error_message = msg
            pq.status = pq.PROCESSING_FAILED
            pq.save()
            return None
        else:
            # Got the Docket, attempt to get/create the DocketEntry, and then
            # create the RECAPDocument
            try:
                de = DocketEntry.objects.get(docket=d,
                                             entry_number=pq.document_number)
            except DocketEntry.DoesNotExist as exc:
                logger.warning("Unable to find docket entry for processing "
                               "queue '%s'. Retrying if max_retries is not "
                               "exceeded." % pq)
                pq.error_message = "Unable to find docket entry for item."
                if self.request.retries == self.max_retries:
                    pq.status = pq.PROCESSING_FAILED
                else:
                    pq.status = pq.QUEUED_FOR_RETRY
                pq.save()
                raise self.retry(exc=exc)

        # All objects accounted for. Make some data.
        rd = RECAPDocument(
            docket_entry=de,
            pacer_doc_id=pq.pacer_doc_id,
            date_upload=timezone.now(),
        )
        if pq.attachment_number is None:
            rd.document_type = RECAPDocument.PACER_DOCUMENT
        else:
            rd.document_type = RECAPDocument.ATTACHMENT

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    content = pq.filepath_local.read()
    new_sha1 = hashlib.sha1(content).hexdigest()
    if all([rd.sha1 == new_sha1,
            rd.is_available,
            rd.filepath_local and os.path.isfile(rd.filepath_local.path)]):
        # All good. Press on.
        new_document = False
    else:
        # Different sha1, it wasn't available, or it's missing from disk. Move
        # the new file over from the processing queue storage.
        new_document = True
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        rd.filepath_local.save(file_name, cf, save=False)
        rd.is_available = True
        rd.sha1 = new_sha1

        # Do page count and extraction
        extension = rd.filepath_local.path.split('.')[-1]
        rd.page_count = get_page_count(rd.filepath_local.path, extension)
        rd.ocr_status = None

    # Ditch the original file
    pq.filepath_local.delete(save=False)
    pq.error_message = ''  # Clear out errors b/c successful
    pq.status = pq.PROCESSING_SUCCESSFUL
    pq.save()

    rd.save()
    if new_document:
        extract_recap_pdf(rd.pk)
        add_or_update_recap_document([rd.pk], force_commit=False)

    return rd
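
Here, as in the other upload paths, the raw bytes are wrapped in a
ContentFile before being handed to the FileField. A minimal sketch of that
wrapper; Django must be importable, but no configured project is needed:

from django.core.files.base import ContentFile

cf = ContentFile(b"%PDF-1.4 fake body")
print(cf.size)    # 18, the length a FileField would report after saving
print(cf.read())  # the bytes that filepath_local.save() would write out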