Пример #1
0
def merge_pacer_docket_into_cl_docket(
    d,
    pacer_case_id,
    docket_data,
    report,
    appellate=False,
    tag_names=None,
):
    """Merge parsed PACER docket data into a CL docket object.

    :param d: The docket object to update and save.
    :param pacer_case_id: The PACER case ID to set on the docket if it does
    not have one yet.
    :param docket_data: The parsed docket data. Its "docket_entries" and
    "parties" keys are read here.
    :param report: The report the data came from; `report.response.text` is
    stored as the raw HTML.
    :param appellate: Whether to apply the appellate metadata update and store
    the HTML with the appellate upload type.
    :param tag_names: Tag names handed to add_tags_to_objs for the docket; the
    resulting tags are forwarded to add_docket_entries.
    :return: The (rds_created, content_updated) tuple returned by
    add_docket_entries.
    """
    # Dockets matched on docket number may never have received PACER data
    # before (common since the FJC data was added to the dockets table), so
    # fill in the case ID whenever it is missing.
    if not d.pacer_case_id:
        d.pacer_case_id = pacer_case_id

    d.add_recap_source()
    update_docket_metadata(d, docket_data)
    d.save()

    if appellate:
        d, og_info = update_docket_appellate_metadata(d, docket_data)
        if og_info is not None:
            og_info.save()
            d.originating_court_information = og_info

    tags = add_tags_to_objs(tag_names, [d])

    # Keep a copy of the raw HTML with the docket in case we need it someday.
    if appellate:
        upload_type = UPLOAD_TYPE.APPELLATE_DOCKET
    else:
        upload_type = UPLOAD_TYPE.DOCKET
    html_file = PacerHtmlFiles(content_object=d, upload_type=upload_type)
    # Only the extension of the name matters w/UUIDFileSystemStorage.
    html_file.filepath.save("docket.html", ContentFile(report.response.text))

    entries = docket_data["docket_entries"]
    rds_created, content_updated = add_docket_entries(d, entries, tags=tags)
    add_parties_and_attorneys(d, docket_data["parties"])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)
    return rds_created, content_updated
Пример #2
0
def process_recap_docket_history_report(self, pk):
    """Process the docket history report.

    :param pk: The primary key of the processing queue item you want to work on
    :returns: A dict with the docket's PK ('docket_pk') and a flag indicating
    whether the docket needs Solr re-indexing ('needs_solr_update'), or None
    if saving the docket hit an IntegrityError on the final retry.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketHistoryReport(map_cl_to_pacer_id(pq.court_id))
    # Open in binary mode so read() always yields bytes and the explicit
    # UTF-8 decode below is well defined instead of depending on what a
    # text-mode read happens to return.
    with open(pq.filepath_local.path, 'rb') as f:
        text = f.read().decode('utf-8')
    report._parse_text(text)
    data = report.data
    logger.info("Parsing completed for item %s" % pq)

    # Merge the contents of the docket into CL.
    d, count = find_docket_object(pq.court_id, pq.pacer_case_id,
                                  data['docket_number'])
    if count > 1:
        logger.info("Found %s dockets during lookup. Choosing oldest." % count)
        d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, data)

    if pq.debug:
        # Debug mode: stop before saving the docket or any of its content,
        # and cancel any follow-up tasks.
        mark_pq_successful(pq, d_id=d.pk)
        self.request.callbacks = None
        return {'docket_pk': d.pk, 'needs_solr_update': False}

    try:
        d.save()
    except IntegrityError as exc:
        logger.warning("Race condition experienced while attempting docket "
                       "save.")
        error_message = "Unable to save docket due to IntegrityError."
        if self.request.retries == self.max_retries:
            # Out of retries: give up and cancel any follow-up tasks.
            mark_pq_status(pq, error_message, pq.PROCESSING_FAILED)
            self.request.callbacks = None
            return None
        else:
            mark_pq_status(pq, error_message, pq.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d,
                                upload_type=DOCKET_HISTORY_REPORT)
    pacer_file.filepath.save(
        'docket_history.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    rds_created, needs_solr_update = add_docket_entries(d, data['docket_entries'])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        'docket_pk': d.pk,
        'needs_solr_update': bool(rds_created or needs_solr_update),
    }
Пример #3
0
def process_recap_attachment(self, pk):
    """Process an uploaded attachment page from the RECAP API endpoint.

    Parses the uploaded attachment page, finds the main document it belongs
    to, and creates or updates one attachment document per usable row. In
    debug mode the page is parsed and matched but nothing is created.

    :param pk: The primary key of the processing queue item you want to work on
    :return: None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id))
    # Open in binary mode so read() always yields bytes and the explicit
    # UTF-8 decode below is well defined instead of depending on what a
    # text-mode read happens to return.
    with open(pq.filepath_local.path, 'rb') as f:
        text = f.read().decode('utf-8')
    att_page._parse_text(text)
    att_data = att_page.data
    logger.info("Parsing completed for item %s" % pq)

    if pq.pacer_case_id in ['undefined', 'null']:
        # Bad data from the client. Fix it with parsed data.
        pq.pacer_case_id = att_data.get('pacer_case_id')
        pq.save()

    # Merge the contents of the data into CL.
    try:
        params = {
            'pacer_doc_id': att_data['pacer_doc_id'],
            'docket_entry__docket__court': pq.court,
        }
        if pq.pacer_case_id:
            # Narrow the lookup to the right case when we know it.
            params['docket_entry__docket__pacer_case_id'] = pq.pacer_case_id
        main_rd = RECAPDocument.objects.get(**params)
    except RECAPDocument.MultipleObjectsReturned:
        # Unclear how to proceed and we don't want to associate this data with
        # the wrong case. We must punt.
        msg = "Too many documents found when attempting to associate " \
              "attachment data"
        mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
        return None
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None
        else:
            mark_pq_status(pq, msg, pq.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # We got the right item. Update/create all the attachments for
    # the docket entry.
    de = main_rd.docket_entry
    if att_data['document_number'] is None:
        # Bankruptcy attachment page. Use the document number from the Main doc
        att_data['document_number'] = main_rd.document_number

    rds_created = []
    if not pq.debug:
        # Save the old HTML to the docket entry.
        pacer_file = PacerHtmlFiles(content_object=de,
                                    upload_type=ATTACHMENT_PAGE)
        pacer_file.filepath.save(
            'attachment_page.html',  # Irrelevant b/c UUIDFileSystemStorage
            ContentFile(text),
        )

        # Create/update the attachment items.
        for attachment in att_data['attachments']:
            if all([attachment['attachment_number'],
                    # Missing on sealed items.
                    attachment.get('pacer_doc_id', False),
                    # Missing on some restricted docs (see Juriscraper)
                    attachment['page_count'] is not None,
                    attachment['description']]):
                rd, created = RECAPDocument.objects.update_or_create(
                    docket_entry=de,
                    document_number=att_data['document_number'],
                    attachment_number=attachment['attachment_number'],
                    document_type=RECAPDocument.ATTACHMENT,
                )
                if created:
                    rds_created.append(rd)
                needs_save = False
                for field in ['description', 'pacer_doc_id']:
                    if attachment[field]:
                        setattr(rd, field, attachment[field])
                        needs_save = True
                if needs_save:
                    rd.save()

                # Do *not* do this async — that can cause race conditions.
                add_or_update_recap_document([rd.pk], force_commit=False)

    mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)
    # `de` is main_rd.docket_entry, so reuse it rather than traversing again.
    process_orphan_documents(rds_created, pq.court_id, de.docket.date_filed)
Пример #4
0
def process_recap_docket(self, pk):
    """Parse an uploaded docket from the RECAP API endpoint and merge it in.

    :param pk: The primary key of the processing queue item to process.
    :returns: None when the upload turns out to be a docket history report or
    is not a valid docket. Otherwise a dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or recap document
            // was created (implying a Solr needs updating).
            'needs_solr_update': True,
        }

    The value is a dict so that it can be ingested in a Celery chain.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    raw_html = pq.filepath_local.read().decode('utf-8')

    if 'History/Documents' in raw_html:
        # Clients older than 1.1.8 upload docket history reports with the
        # docket upload type. Re-label the item and hand it to the history
        # report processor instead.
        pq.upload_type = DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.callbacks = None
        return None

    report._parse_text(raw_html)
    data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        mark_pq_status(pq, "Not a valid docket upload.", pq.INVALID_CONTENT)
        self.request.callbacks = None
        return None

    # Merge the contents of the docket into CL.
    docket_number = data['docket_number']
    d, count = find_docket_object(pq.court_id, pq.pacer_case_id,
                                  docket_number)
    if count > 1:
        logger.info("Found %s dockets during lookup. Choosing oldest." % count)
        d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, data)

    if pq.debug:
        # Debug mode: stop before saving the docket or any of its content.
        mark_pq_successful(pq, d_id=d.pk)
        self.request.callbacks = None
        return {'docket_pk': d.pk, 'needs_solr_update': False}

    d.save()

    # Keep the raw HTML with the docket in case we need it someday. Only the
    # extension of the name matters w/UUIDFileSystemStorage.
    html_file = PacerHtmlFiles(content_object=d, upload_type=DOCKET)
    html_file.filepath.save('docket.html', ContentFile(raw_html))

    entries = data['docket_entries']
    rds_created, needs_solr_update = add_docket_entries(d, entries)
    add_parties_and_attorneys(d, data['parties'])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    mark_pq_successful(pq, d_id=d.pk)
    result = {
        'docket_pk': d.pk,
        'needs_solr_update': bool(rds_created or needs_solr_update),
    }
    return result
Пример #5
0
def get_docket_by_pacer_case_id(self,
                                pacer_case_id,
                                court_id,
                                session,
                                tag=None,
                                **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of kwargs
    that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    :param pacer_case_id: The internal case ID of the item in PACER.
    :param court_id: A courtlistener court ID.
    :param session: A valid PacerSession object.
    :param tag: The tag name that should be stored with the item in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    :return: A dict with the keys 'docket_pk' and 'needs_solr_update', or None
    if the report came back without any parsed docket data.
    """
    report = DocketReport(map_cl_to_pacer_id(court_id), session)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))
    try:
        d = Docket.objects.get(
            pacer_case_id=pacer_case_id,
            court_id=court_id,
        )
    except (Docket.DoesNotExist, Docket.MultipleObjectsReturned):
        # No single unambiguous docket yet; it is looked up again below once
        # the docket number is known.
        d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        if first_missing_id > 1:
            # We don't have to get the whole thing! An explicit caller-provided
            # doc_num_start still wins because of setdefault.
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" %
                (court_id, pacer_case_id))

    if not docket_data:
        # Without parsed data the key lookups below would raise KeyError.
        # Stop cleanly and cancel any follow-up tasks instead.
        logger.info("No valid docket data for %s.%s", court_id, pacer_case_id)
        self.request.callbacks = None
        return None

    # Merge the contents into CL.
    if d is None:
        d, count = find_docket_object(court_id, pacer_case_id,
                                      docket_data['docket_number'])
        if count > 1:
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d.save()
    if tag is not None:
        # From here on `tag` is the Tag object rather than its name.
        tag, _ = Tag.objects.get_or_create(name=tag)
        d.tags.add(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d, upload_type=DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, needs_solr_update = add_docket_entries(
        d, docket_data['docket_entries'], tag=tag)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)
    return {
        'docket_pk': d.pk,
        'needs_solr_update': bool(rds_created or needs_solr_update),
    }
Пример #6
0
def process_recap_attachment(self, pk, tag_names=None):
    """Process an uploaded attachment page from the RECAP API endpoint.

    Parses the uploaded attachment page, finds the main document it belongs
    to, and creates or updates one attachment document per usable row. In
    debug mode the page is parsed and matched but nothing is created.

    :param pk: The primary key of the processing queue item you want to work on
    :param tag_names: A list of tag names to add to all items created or
    modified in this function.
    :return: None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id))
    # Open in binary mode so read() always yields bytes and the explicit
    # UTF-8 decode below is well defined instead of depending on what a
    # text-mode read happens to return.
    with open(pq.filepath_local.path, "rb") as f:
        text = f.read().decode("utf-8")
    att_page._parse_text(text)
    att_data = att_page.data
    logger.info("Parsing completed for item %s" % pq)

    if att_data == {}:
        # Bad attachment page.
        msg = "Not a valid attachment page upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    if pq.pacer_case_id in ["undefined", "null"]:
        # Bad data from the client. Fix it with parsed data.
        pq.pacer_case_id = att_data.get("pacer_case_id")
        pq.save()

    # Merge the contents of the data into CL.
    try:
        params = {
            "pacer_doc_id": att_data["pacer_doc_id"],
            "docket_entry__docket__court": pq.court,
        }
        if pq.pacer_case_id:
            # Narrow the lookup to the right case when we know it.
            params["docket_entry__docket__pacer_case_id"] = pq.pacer_case_id
        main_rd = RECAPDocument.objects.get(**params)
    except RECAPDocument.MultipleObjectsReturned:
        # Unclear how to proceed and we don't want to associate this data with
        # the wrong case. We must punt.
        msg = (
            "Too many documents found when attempting to associate "
            "attachment data"
        )
        mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
        return None
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # We got the right item. Update/create all the attachments for
    # the docket entry.
    de = main_rd.docket_entry
    if att_data["document_number"] is None:
        # Bankruptcy attachment page. Use the document number from the Main doc
        att_data["document_number"] = main_rd.document_number

    rds_created = []
    if not pq.debug:
        # Save the old HTML to the docket entry.
        pacer_file = PacerHtmlFiles(
            content_object=de, upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE
        )
        pacer_file.filepath.save(
            "attachment_page.html",  # Irrelevant b/c UUIDFileSystemStorage
            ContentFile(text),
        )

        # Resolve the tag names to Tag objects once, before the loop.
        tags = []
        if tag_names:
            for tag_name in tag_names:
                tag, _ = Tag.objects.get_or_create(name=tag_name)
                tags.append(tag)

        # Create/update the attachment items.
        for attachment in att_data["attachments"]:
            if all(
                [
                    attachment["attachment_number"],
                    # Missing on sealed items.
                    attachment.get("pacer_doc_id", False),
                    # Missing on some restricted docs (see Juriscraper)
                    attachment["page_count"] is not None,
                    attachment["description"],
                ]
            ):
                rd, created = RECAPDocument.objects.update_or_create(
                    docket_entry=de,
                    document_number=att_data["document_number"],
                    attachment_number=attachment["attachment_number"],
                    document_type=RECAPDocument.ATTACHMENT,
                )
                if created:
                    rds_created.append(rd)
                needs_save = False
                for field in ["description", "pacer_doc_id"]:
                    if attachment[field]:
                        setattr(rd, field, attachment[field])
                        needs_save = True

                # Only set page_count and file_size if they're blank, in case
                # we got the real value by measuring.
                if rd.page_count is None:
                    rd.page_count = attachment["page_count"]
                if rd.file_size is None and attachment["file_size_str"]:
                    try:
                        rd.file_size = convert_size_to_bytes(
                            attachment["file_size_str"]
                        )
                    except ValueError:
                        # Unparseable size string; leave file_size blank.
                        pass

                if needs_save:
                    rd.save()
                # Iterating an empty list is a no-op, so no guard is needed.
                for tag in tags:
                    tag.tag_object(rd)

                # Do *not* do this async — that can cause race conditions.
                add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)
    process_orphan_documents(
        rds_created, pq.court_id, main_rd.docket_entry.docket.date_filed
    )
    changed = mark_ia_upload_needed(de.docket)
    if changed:
        de.docket.save()
Пример #7
0
def process_recap_attachment(self, pk):
    """Process an uploaded attachment page from the RECAP API endpoint.

    Parses the uploaded attachment page, finds the main document it belongs
    to, and creates or updates one attachment document per usable row. In
    debug mode the page is parsed and matched but nothing is created.

    :param pk: The primary key of the processing queue item you want to work on
    :return: None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    att_page._parse_text(text)
    att_data = att_page.data
    logger.info("Parsing completed for item %s" % pq)

    # Merge the contents of the data into CL.
    try:
        main_rd = RECAPDocument.objects.get(
            pacer_doc_id=att_data['pacer_doc_id'],
            docket_entry__docket__court=pq.court,
        )
    except RECAPDocument.MultipleObjectsReturned:
        # Unclear how to proceed and we don't want to associate this data with
        # the wrong case. Mark the item as failed rather than letting the
        # exception escape and leave it stuck "in progress".
        msg = "Too many documents found when attempting to associate " \
              "attachment data"
        mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
        return None
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None
        else:
            mark_pq_status(pq, msg, pq.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # We got the right item. Update/create all the attachments for
    # the docket entry.
    de = main_rd.docket_entry
    if att_data['document_number'] is None:
        # Bankruptcy attachment page. Use the document number from the Main doc
        att_data['document_number'] = main_rd.document_number

    if not pq.debug:
        # Save the old HTML to the docket entry.
        pacer_file = PacerHtmlFiles(content_object=de)
        pacer_file.filepath.save(
            'attachment_page.html',  # Irrelevant b/c UUIDFileSystemStorage
            ContentFile(text),
        )

        # Create/update the attachment items.
        for attachment in att_data['attachments']:
            if all([
                    attachment['attachment_number'],
                    # Missing on sealed items.
                    attachment.get('pacer_doc_id', False),
                    # Missing on some restricted docs (see Juriscraper)
                    attachment['page_count'] is not None,
                    attachment['description']
            ]):
                # Use a separate name so the main document is not shadowed.
                att_rd, _ = RECAPDocument.objects.update_or_create(
                    docket_entry=de,
                    document_number=att_data['document_number'],
                    attachment_number=attachment['attachment_number'],
                    document_type=RECAPDocument.ATTACHMENT,
                )
                needs_save = False
                for field in ['description', 'pacer_doc_id']:
                    if attachment[field]:
                        setattr(att_rd, field, attachment[field])
                        needs_save = True
                if needs_save:
                    try:
                        att_rd.save()
                    except IntegrityError:
                        # Happens when we hit courtlistener/issues#765, in which
                        # we violate the unique constraint on pacer_doc_id.
                        continue

                # Do *not* do this async — that can cause race conditions.
                add_or_update_recap_document([att_rd.pk], force_commit=False)

    mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)
Пример #8
0
def process_recap_docket_history_report(self, pk):
    """Parse an uploaded docket history report and merge it into CL.

    :param pk: The primary key of the processing queue item to process.
    :returns: A dict with the keys "docket_pk" and "content_updated", or None
    when the upload could not be read, was not a valid docket history page, or
    the docket could not be saved on the final retry.
    """
    start_time = now()
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    try:
        text = pq.filepath_local.read().decode("utf-8")
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        if not ((self.request.retries == self.max_retries) or pq.debug):
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)
        # Out of retries (or debugging): give up on this item.
        mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
        return None

    report = DocketHistoryReport(map_cl_to_pacer_id(pq.court_id))
    report._parse_text(text)
    data = report.data
    logger.info("Parsing completed for item %s" % pq)

    if data == {}:
        # Bad docket history page.
        mark_pq_status(
            pq,
            "Not a valid docket history page upload.",
            PROCESSING_STATUS.INVALID_CONTENT,
        )
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    docket_number = data["docket_number"]
    d, docket_count = find_docket_object(
        pq.court_id, pq.pacer_case_id, docket_number
    )
    if docket_count > 1:
        logger.info(
            "Found %s dockets during lookup. Choosing oldest." % docket_count
        )
        d = d.earliest("date_created")

    d.add_recap_source()
    update_docket_metadata(d, data)

    if pq.debug:
        # Debug mode: stop before saving the docket or any of its content.
        mark_pq_successful(pq, d_id=d.pk)
        self.request.chain = None
        return {"docket_pk": d.pk, "content_updated": False}

    try:
        d.save()
    except IntegrityError as exc:
        logger.warning(
            "Race condition experienced while attempting docket save."
        )
        error_message = "Unable to save docket due to IntegrityError."
        if self.request.retries == self.max_retries:
            mark_pq_status(pq, error_message, PROCESSING_STATUS.FAILED)
            self.request.chain = None
            return None
        mark_pq_status(pq, error_message, PROCESSING_STATUS.QUEUED_FOR_RETRY)
        raise self.retry(exc=exc)

    # Keep the raw HTML with the docket in case we need it someday. Only the
    # extension of the name matters w/UUIDFileSystemStorage.
    html_file = PacerHtmlFiles(
        content_object=d, upload_type=UPLOAD_TYPE.DOCKET_HISTORY_REPORT
    )
    html_file.filepath.save("docket_history.html", ContentFile(text))

    entries = data["docket_entries"]
    rds_created, content_updated = add_docket_entries(d, entries)
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    # The alert is only enqueued when content changed and the lookup found at
    # least one docket; it is sent only if it was newly enqueued.
    if content_updated and docket_count > 0 and enqueue_docket_alert(d.pk):
        send_docket_alert(d.pk, start_time)
    mark_pq_successful(pq, d_id=d.pk)
    result = {
        "docket_pk": d.pk,
        "content_updated": bool(rds_created or content_updated),
    }
    return result
Пример #9
0
def get_docket_by_pacer_case_id(self,
                                data,
                                court_id,
                                cookies,
                                tag_names=None,
                                **kwargs):
    """Query PACER for a docket report and merge the result into CL.

    Any extra keyword arguments are passed through to DocketReport.query();
    see that method for the acceptable parameters.

    :param data: A dict containing:
        Required: 'pacer_case_id': The internal case ID of the item in PACER.
        Optional: 'docket_pk': The ID of the docket to work on to avoid lookups
                  if it's known in advance.
    :param court_id: A courtlistener court ID.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a
    logged-in PACER user.
    :param tag_names: A list of tag names that should be stored with the item
    in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    :return: A dict with the keys 'docket_pk' and 'content_updated', or None
    when `data` is None or the report has no docket data.
    """
    s = PacerSession(cookies=cookies)
    if data is None:
        logger.info("Empty data argument. Terminating chains and exiting.")
        self.request.callbacks = None
        return None

    pacer_case_id = data.get('pacer_case_id')
    report = DocketReport(map_cl_to_pacer_id(court_id), s)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))

    known_pk = data.get('docket_pk')
    if known_pk is not None:
        d = Docket.objects.get(pk=data['docket_pk'])
    else:
        try:
            d = Docket.objects.get(
                pacer_case_id=pacer_case_id,
                court_id=court_id,
            )
        except (Docket.DoesNotExist, Docket.MultipleObjectsReturned):
            # No single unambiguous docket yet; look it up again below once
            # the docket number is known.
            d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        if first_missing_id > 1:
            # Only fetch from the first missing entry onwards, unless the
            # caller already chose a starting point.
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" %
                (court_id, pacer_case_id))

    if not docket_data:
        logger.info("No valid docket data for %s.%s", court_id, pacer_case_id)
        self.request.callbacks = None
        return None

    # Merge the contents into CL.
    if d is None:
        docket_number = docket_data['docket_number']
        d, count = find_docket_object(court_id, pacer_case_id, docket_number)
        if count > 1:
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d.save()

    # Tag the docket itself and collect the tags for the docket entries.
    tags = []
    if tag_names is not None:
        for tag_name in tag_names:
            tag, _ = Tag.objects.get_or_create(name=tag_name)
            tag.tag_object(d)
            tags.append(tag)

    # Keep the raw HTML with the docket in case we need it someday. Only the
    # extension of the name matters w/UUIDFileSystemStorage.
    html_file = PacerHtmlFiles(content_object=d,
                               upload_type=UPLOAD_TYPE.DOCKET)
    html_file.filepath.save('docket.html', ContentFile(report.response.text))

    entries = docket_data['docket_entries']
    rds_created, content_updated = add_docket_entries(d, entries, tags=tags)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)
    result = {
        'docket_pk': d.pk,
        'content_updated': bool(rds_created or content_updated),
    }
    return result
Пример #10
0
def do_case_query_by_pacer_case_id(self,
                                   data,
                                   court_id,
                                   cookies,
                                   tag_names=None):
    """Run a case query (iquery.pl) on a case and save the resulting data.

    :param data: A dict containing at least the following: {
        'pacer_case_id': The internal pacer case ID for the item.
    }
    :param court_id: A courtlistener court ID
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a
    logged-in PACER user.
    :param tag_names: A list of tag names to associate with the docket when
    saving it in the DB.
    :return: A dict with the pacer_case_id and docket_pk values, or None when
    `data` is None or the query has no docket data.
    """
    s = PacerSession(cookies=cookies)
    if data is None:
        logger.info("Empty data argument. Terminating chains and exiting.")
        self.request.callbacks = None
        return None

    pacer_case_id = data.get('pacer_case_id')
    report = CaseQuery(map_cl_to_pacer_id(court_id), s)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))
    try:
        d = Docket.objects.get(
            pacer_case_id=pacer_case_id,
            court_id=court_id,
        )
    except (Docket.DoesNotExist, Docket.MultipleObjectsReturned):
        # No single unambiguous docket yet; look it up again below once the
        # docket number is known.
        d = None

    report.query(pacer_case_id)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" %
                (court_id, pacer_case_id))

    if not docket_data:
        logger.info("No valid docket data for %s.%s", court_id, pacer_case_id)
        self.request.callbacks = None
        return None

    # Merge the contents into CL.
    if d is None:
        docket_number = docket_data['docket_number']
        d, count = find_docket_object(court_id, pacer_case_id, docket_number)
        if count > 1:
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d.save()

    # Tag the docket with every requested tag name.
    if tag_names is not None:
        for tag_name in tag_names:
            tag, _ = Tag.objects.get_or_create(name=tag_name)
            tag.tag_object(d)

    # Keep the raw HTML with the docket in case we need it someday. Only the
    # extension of the name matters w/UUIDFileSystemStorage.
    html_file = PacerHtmlFiles(content_object=d,
                               upload_type=UPLOAD_TYPE.CASE_REPORT_PAGE)
    html_file.filepath.save('case_report.html',
                            ContentFile(report.response.text))

    logger.info("Created/updated docket: %s" % d)
    result = {
        'pacer_case_id': pacer_case_id,
        'docket_pk': d.pk,
    }
    return result
Пример #11
0
def get_docket_by_pacer_case_id(self, pacer_case_id, court_id, session,
                                tag=None, **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of kwargs
    that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    :param pacer_case_id: The internal case ID of the item in PACER.
    :param court_id: A courtlistener court ID.
    :param session: A valid PacerSession object.
    :param tag: The tag name that should be stored with the item in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    :return: The docket that was created or updated, or None when more than
    one existing docket matches the PACER case ID / docket number.
    """
    report = DocketReport(map_cl_to_pacer_id(court_id), session)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))
    try:
        d = Docket.objects.get(
            pacer_case_id=pacer_case_id,
            court_id=court_id,
        )
    except (Docket.DoesNotExist, Docket.MultipleObjectsReturned):
        # Either way we have no single docket to base a partial query on.
        d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        if first_missing_id > 1:
            # We don't have to get the whole thing! A caller-supplied
            # doc_num_start still wins because of setdefault.
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" % (court_id,
                                                             pacer_case_id))

    # Merge the contents into CL.
    try:
        if d is None:
            d = Docket.objects.get(
                Q(pacer_case_id=pacer_case_id) |
                Q(docket_number=docket_data['docket_number']),
                court_id=court_id,
            )
        # Add RECAP as a source if it's not already.
        if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
            d.source = Docket.RECAP_AND_SCRAPER
        elif d.source == Docket.COLUMBIA:
            d.source = Docket.COLUMBIA_AND_RECAP
        elif d.source == Docket.COLUMBIA_AND_SCRAPER:
            d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    except Docket.DoesNotExist:
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pacer_case_id,
            court_id=court_id
        )
    except Docket.MultipleObjectsReturned:
        logger.error("Too many dockets returned when trying to look up '%s.%s'" %
                     (court_id, pacer_case_id))
        return None

    update_docket_metadata(d, docket_data)
    d.save()
    if tag is not None:
        # From here on `tag` is the Tag object rather than its name.
        tag, _ = Tag.objects.get_or_create(name=tag)
        d.tags.add(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s.%s'" % (docket_entry['document_number'],
                                              court_id, pacer_case_id)
            )
            continue
        else:
            if tag is not None:
                de.tags.add(tag)

        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            try:
                rd = RECAPDocument.objects.create(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                    pacer_doc_id=docket_entry['pacer_doc_id'],
                    is_available=False,
                )
            except IntegrityError:
                # Race condition. The item was created after our get failed.
                rd = RECAPDocument.objects.get(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                )
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry "
                "number: '%s', docket: %s" % (docket_entry['document_number'], d)
            )
            continue

        # Backfill a blank pacer_doc_id from the docket and persist it.
        # Previously the value was only set on the in-memory object and was
        # lost when the loop moved on.
        if not rd.pacer_doc_id and docket_entry['pacer_doc_id']:
            rd.pacer_doc_id = docket_entry['pacer_doc_id']
            rd.save()
        if tag is not None:
            rd.tags.add(tag)

    add_parties_and_attorneys(d, docket_data['parties'])
    logger.info("Created/updated docket: %s" % d)

    return d
Пример #12
0
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or recap document
            // was created (implying a Solr needs updating).
            'needs_solr_update': True,
        }

    This value is a dict so that it can be ingested in a Celery chain. None is
    returned instead (and the callbacks are cleared) when the upload is handed
    off as a docket history report, is not a valid docket, or matches more
    than one docket.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')

    if 'History/Documents' in text:
        # Prior to 1.1.8, we did not separate docket history reports into their
        # own upload_type. Alas, we still have some old clients around, so we
        # need to handle those clients here.
        pq.upload_type = pq.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.callbacks = None
        return None

    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if docket_data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, pq.INVALID_CONTENT)
        self.request.callbacks = None
        return None

    # Merge the contents of the docket into CL. Attempt several lookups of
    # decreasing specificity. Note that pacer_case_id is required for Docket
    # uploads.
    d = None
    for kwargs in [{'pacer_case_id': pq.pacer_case_id,
                    'docket_number': docket_data['docket_number']},
                   {'pacer_case_id': pq.pacer_case_id},
                   {'docket_number': docket_data['docket_number'],
                    'pacer_case_id': None}]:
        try:
            d = Docket.objects.get(court_id=pq.court_id, **kwargs)
            break
        except Docket.DoesNotExist:
            continue
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to look up '%s'" % pq
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            self.request.callbacks = None
            return None

    if d is None:
        # Couldn't find it. Make a new one.
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pq.pacer_case_id,
            court_id=pq.court_id
        )

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER

    update_docket_metadata(d, docket_data)

    if pq.debug:
        # Debug uploads stop before anything is written to the docket tables.
        mark_pq_successful(pq, d_id=d.pk)
        self.request.callbacks = None
        return {'docket_pk': d.pk, 'needs_solr_update': False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries & documents
    rds_created = []
    needs_solr_update = False
    for docket_entry in docket_data['docket_entries']:
        try:
            de, de_created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue
        if de_created:
            needs_solr_update = True

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        params = {
            'docket_entry': de,
            # No attachments when uploading dockets.
            'document_type': RECAPDocument.PACER_DOCUMENT,
            'document_number': docket_entry['document_number'],
        }
        try:
            rd = RECAPDocument.objects.get(**params)
        except RECAPDocument.DoesNotExist:
            rd = RECAPDocument.objects.create(
                pacer_doc_id=docket_entry['pacer_doc_id'],
                is_available=False,
                **params
            )
            rds_created.append(rd)
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue
        else:
            # Backfill a blank pacer_doc_id from the parsed docket entry (the
            # same value used when creating the document above) and persist
            # it. The queue item's own pacer_doc_id describes the upload, not
            # this entry, and the old assignment was never saved.
            if not rd.pacer_doc_id and docket_entry['pacer_doc_id']:
                rd.pacer_doc_id = docket_entry['pacer_doc_id']
                rd.save()

    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        'docket_pk': d.pk,
        'needs_solr_update': bool(rds_created or needs_solr_update),
    }
Пример #13
0
def merge_attachment_page_data(
    court: Court,
    pacer_case_id: int,
    pacer_doc_id: int,
    document_number: int,
    text: str,
    attachment_dicts: List[Dict[str, Union[int, str]]],
    debug: bool = False,
) -> Tuple[List[RECAPDocument], DocketEntry]:
    """Fold the contents of a PACER attachment page into an existing docket.

    The main document is located by its pacer_doc_id within the court (and
    within the case, when a pacer_case_id is provided). Each usable attachment
    is then created or updated underneath the main document's docket entry.

    :param court: The court object we're working with
    :param pacer_case_id: A PACER case ID
    :param pacer_doc_id: A PACER document ID
    :param document_number: The docket entry number
    :param text: The text of the attachment page
    :param attachment_dicts: A list of Juriscraper-parsed dicts for each
    attachment.
    :param debug: When True, return right after the lookup without saving
    anything.
    :return: A list of RECAPDocuments modified or created during the process,
    and the DocketEntry object associated with the RECAPDocuments
    :raises: RECAPDocument.MultipleObjectsReturned, RECAPDocument.DoesNotExist
    """
    lookup = {
        "pacer_doc_id": pacer_doc_id,
        "docket_entry__docket__court": court,
    }
    if pacer_case_id:
        lookup["docket_entry__docket__pacer_case_id"] = pacer_case_id

    try:
        main_rd = RECAPDocument.objects.get(**lookup)
    except RECAPDocument.MultipleObjectsReturned as exc:
        # Ambiguous match: rather than risk attaching this data to the wrong
        # case, give up and let the caller deal with it.
        raise exc
    except RECAPDocument.DoesNotExist as exc:
        # Nothing to hang the attachment metadata on. Orphaned documents could
        # perhaps be searched for and added here, as is done when adding
        # dockets; that matters most for people who arrive at the attachment
        # page from a free look email.
        raise exc

    # The main document was found; its docket entry owns all the attachments.
    de = main_rd.docket_entry
    if document_number is None:
        # Bankruptcy attachment page. Use the document number from the Main doc
        document_number = main_rd.document_number

    if debug:
        return [], de

    # Keep a copy of the attachment page HTML alongside the docket entry.
    html_file = PacerHtmlFiles(
        content_object=de, upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE
    )
    html_file.filepath.save(
        "attachment_page.html",  # Irrelevant b/c UUIDFileSystemStorage
        ContentFile(text),
    )

    # Create/update the attachment items.
    created_rds = []
    affected_rds = []
    for attachment in attachment_dicts:
        # All four values are looked up before any is tested, so a missing
        # required key raises even when an earlier value is falsy.
        required = (
            attachment["attachment_number"],
            # Missing on sealed items.
            attachment.get("pacer_doc_id", False),
            # Missing on some restricted docs (see Juriscraper)
            attachment["page_count"] is not None,
            attachment["description"],
        )
        if not all(required):
            continue

        rd, was_created = RECAPDocument.objects.update_or_create(
            docket_entry=de,
            document_number=document_number,
            attachment_number=attachment["attachment_number"],
            document_type=RECAPDocument.ATTACHMENT,
        )
        if was_created:
            created_rds.append(rd)
        affected_rds.append(rd)

        if attachment["description"]:
            rd.description = attachment["description"]
        if attachment["pacer_doc_id"]:
            rd.pacer_doc_id = attachment["pacer_doc_id"]

        # page_count and file_size are filled in only while blank, so values
        # obtained by measuring the real document are not overwritten.
        if rd.page_count is None:
            rd.page_count = attachment["page_count"]
        if rd.file_size is None and attachment["file_size_str"]:
            try:
                size_in_bytes = convert_size_to_bytes(
                    attachment["file_size_str"]
                )
            except ValueError:
                pass
            else:
                rd.file_size = size_in_bytes
        rd.save()

        # Do *not* do this async — that can cause race conditions.
        add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_ia_upload_needed(de.docket, save_docket=True)
    process_orphan_documents(created_rds, court.pk, de.docket.date_filed)
    return affected_rds, de
Пример #14
0
def process_recap_docket(self, pk):
    """Merge a docket uploaded through the RECAP API endpoint into CL.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or
            // recap document was created (implying a Solr needs
            // updating).
            'content_updated': True,
        }

    A dict is used so that the value can be ingested in a Celery chain. None
    is returned when the file cannot be read, when the upload turns out to be
    a docket history report, or when it is not a valid docket.

    """
    started_at = now()
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq}")

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))

    try:
        text = pq.filepath_local.read().decode()
    except IOError as exc:
        msg = f"Internal processing error ({exc.errno}: {exc.strerror})."
        give_up = (self.request.retries == self.max_retries) or pq.debug
        if not give_up:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)
        mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
        return None

    if "History/Documents" in text:
        # Clients older than 1.1.8 send docket history reports without their
        # own upload_type. Some are still in use, so re-label the item and
        # hand it to the history report processor.
        pq.upload_type = UPLOAD_TYPE.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.chain = None
        return None

    report._parse_text(text)
    data = report.data
    logger.info(f"Parsing completed of item {pq}")

    if data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        mark_pq_status(
            pq,
            "Not a valid docket upload.",
            PROCESSING_STATUS.INVALID_CONTENT,
        )
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    d = find_docket_object(
        pq.court_id, pq.pacer_case_id, data["docket_number"]
    )

    d.add_recap_source()
    update_docket_metadata(d, data)
    if not d.pacer_case_id:
        d.pacer_case_id = pq.pacer_case_id

    if pq.debug:
        # Debug uploads end here, before the docket is saved.
        mark_pq_successful(pq, d_id=d.pk)
        self.request.chain = None
        return {"docket_pk": d.pk, "content_updated": False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    html_file = PacerHtmlFiles(
        content_object=d, upload_type=UPLOAD_TYPE.DOCKET
    )
    html_file.filepath.save(
        "docket.html",  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    rds_created, content_updated = add_docket_entries(
        d, data["docket_entries"]
    )
    add_parties_and_attorneys(d, data["parties"])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    # Only send an alert when this call is the one that enqueued it.
    if content_updated and enqueue_docket_alert(d.pk):
        send_docket_alert(d.pk, started_at)
    mark_pq_successful(pq, d_id=d.pk)

    result = {
        "docket_pk": d.pk,
        "content_updated": bool(rds_created or content_updated),
    }
    return result
Пример #15
0
def get_appellate_docket_by_docket_number(self,
                                          docket_number,
                                          court_id,
                                          cookies,
                                          tag_names=None,
                                          **kwargs):
    """Fetch an appellate docket from PACER by docket number and merge it
    into CL.

    For details of acceptable parameters, see DocketReport.query()

    :param docket_number: The docket number of the case.
    :param court_id: A courtlistener/PACER appellate court ID.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a
    logged-in PACER user.
    :param tag_names: The tag name that should be stored with the item in the
    DB, if desired.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    :return: A dict with the keys 'docket_pk' and 'content_updated', or None
    (with the callbacks cleared) when retries are exhausted or no docket data
    comes back.
    """
    session = PacerSession(cookies=cookies)
    report = AppellateDocketReport(court_id, session)
    logging_id = "%s - %s" % (court_id, docket_number)
    logger.info("Querying docket report %s", logging_id)

    try:
        report.query(docket_number, **kwargs)
    except requests.RequestException as e:
        logger.warning("Problem getting docket %s", logging_id)
        if self.request.retries != self.max_retries:
            raise self.retry(exc=e)
        # Out of retries: stop the rest of the chain.
        self.request.callbacks = None
        return None

    docket_data = report.data
    logger.info('Querying and parsing complete for %s', logging_id)

    if docket_data == {}:
        logger.info("Unable to find docket: %s", logging_id)
        self.request.callbacks = None
        return None

    try:
        d = Docket.objects.get(
            docket_number=docket_number,
            court_id=court_id,
        )
    except (Docket.DoesNotExist, Docket.MultipleObjectsReturned):
        d = None

    if d is None:
        # No single exact match; use the general lookup and, when it finds
        # several candidates, keep the earliest one created.
        d, count = find_docket_object(court_id, docket_number, docket_number)
        if count > 1:
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d, og_info = update_docket_appellate_metadata(d, docket_data)
    if not d.pacer_case_id:
        d.pacer_case_id = docket_number

    if og_info is not None:
        # Save the originating court info first so the docket can point at it.
        og_info.save()
        d.originating_court_information = og_info
    d.save()

    tags = []
    if tag_names is not None:
        for name in tag_names:
            tag_obj, _ = Tag.objects.get_or_create(name=name)
            tag_obj.tag_object(d)
            tags.append(tag_obj)

    # Save the HTML to the docket in case we need it someday.
    html_file = PacerHtmlFiles(content_object=d,
                               upload_type=UPLOAD_TYPE.APPELLATE_DOCKET)
    html_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, content_updated = add_docket_entries(
        d, docket_data['docket_entries'], tags=tags)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)

    result = {
        'docket_pk': d.pk,
        'content_updated': bool(rds_created or content_updated),
    }
    return result
Пример #16
0
def process_recap_claims_register(self, pk):
    """Merge bankruptcy claims registry HTML into RECAP

    :param pk: The primary key of the processing queue item you want to work on
    :type pk: int
    :return: A dict holding the PK of the docket under the key 'docket_pk', or
    None when the item is a debug upload, cannot be read, cannot be parsed, or
    cannot be saved after the final retry.
    :rtype: dict or None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    if pq.debug:
        # Proper debugging not supported on this endpoint. Just abort.
        mark_pq_successful(pq)
        self.request.chain = None
        return None

    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    try:
        text = pq.filepath_local.read().decode("utf-8")
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        give_up = (self.request.retries == self.max_retries) or pq.debug
        if not give_up:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)
        mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
        return None

    report = ClaimsRegister(map_cl_to_pacer_id(pq.court_id))
    report._parse_text(text)
    data = report.data
    logger.info("Parsing completed for item %s" % pq)

    if not data:
        # Bad HTML
        mark_pq_status(
            pq,
            "Not a valid claims registry page or other parsing failure",
            PROCESSING_STATUS.INVALID_CONTENT,
        )
        self.request.chain = None
        return None

    # Locate the docket this claims register belongs to.
    d, docket_count = find_docket_object(
        pq.court_id, pq.pacer_case_id, data["docket_number"]
    )
    if docket_count > 1:
        logger.info(
            "Found %s dockets during lookup. Choosing oldest." % docket_count
        )
        d = d.earliest("date_created")

    # Merge the contents into CL
    d.add_recap_source()
    update_docket_metadata(d, data)

    try:
        d.save()
    except IntegrityError as exc:
        logger.warning(
            "Race condition experienced while attempting docket save."
        )
        error_message = "Unable to save docket due to IntegrityError."
        if self.request.retries != self.max_retries:
            mark_pq_status(
                pq, error_message, PROCESSING_STATUS.QUEUED_FOR_RETRY
            )
            raise self.retry(exc=exc)
        # Out of retries: record the failure and stop the chain.
        mark_pq_status(pq, error_message, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return None

    add_bankruptcy_data_to_docket(d, data)
    add_claims_to_docket(d, data["claims"])
    logger.info("Created/updated claims data for %s", pq)

    # Add the HTML to the docket in case we need it someday.
    html_file = PacerHtmlFiles(
        content_object=d, upload_type=UPLOAD_TYPE.CLAIMS_REGISTER
    )
    html_file.filepath.save(
        # We only care about the ext w/UUIDFileSystemStorage
        "claims_registry.html",
        ContentFile(text),
    )

    mark_pq_successful(pq, d_id=d.pk)
    result = {"docket_pk": d.pk}
    return result
Пример #17
0
def process_recap_docket(pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work on.
    :return: The docket that was created or updated (unsaved when the queue
    item is a debug upload), or None when more than one docket matches.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    # Merge the contents of the docket into CL
    try:
        d = Docket.objects.get(
            Q(pacer_case_id=pq.pacer_case_id) |
            Q(docket_number=docket_data['docket_number']),
            court_id=pq.court_id,
        )
        # Add RECAP as a source if it's not already.
        if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
            d.source = Docket.RECAP_AND_SCRAPER
        elif d.source == Docket.COLUMBIA:
            d.source = Docket.COLUMBIA_AND_RECAP
        elif d.source == Docket.COLUMBIA_AND_SCRAPER:
            d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    except Docket.DoesNotExist:
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pq.pacer_case_id,
            court_id=pq.court_id
        )
    except Docket.MultipleObjectsReturned:
        msg = "Too many dockets found when trying to look up '%s'" % pq
        logger.error(msg)
        pq.error_message = msg
        pq.status = pq.PROCESSING_FAILED
        pq.save()
        return None

    update_docket_metadata(d, docket_data)

    if pq.debug:
        # Debug uploads stop before anything is written to the docket tables.
        mark_pq_successful(pq, d_id=d.pk)
        return d

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            RECAPDocument.objects.create(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
                pacer_doc_id=docket_entry['pacer_doc_id'],
                is_available=False,
            )
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" % (docket_entry['document_number'], pq)
            )
            continue
        else:
            # Backfill a blank pacer_doc_id from the parsed docket entry (the
            # same value used when creating the document above) and persist
            # it. The queue item's own pacer_doc_id describes the upload, not
            # this entry, and the old assignment was never saved.
            if not rd.pacer_doc_id and docket_entry['pacer_doc_id']:
                rd.pacer_doc_id = docket_entry['pacer_doc_id']
                rd.save()

    add_parties_and_attorneys(d, docket_data['parties'])
    mark_pq_successful(pq, d_id=d.pk)
    return d
Пример #18
0
def process_recap_appellate_docket(self, pk):
    """Merge an appellate docket uploaded through the RECAP API endpoint
    into CL.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or
            // recap document was created (implying a Solr needs
            // updating).
            'content_updated': True,
        }

    A dict is used so that the value can be ingested in a Celery chain. None
    is returned when the file cannot be read or is not a valid docket.

    """
    started_at = now()
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info(
        "Processing Appellate RECAP item"
        " (debug is: %s): %s" % (pq.debug, pq)
    )

    report = AppellateDocketReport(map_cl_to_pacer_id(pq.court_id))

    try:
        text = pq.filepath_local.read().decode("utf-8")
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        give_up = (self.request.retries == self.max_retries) or pq.debug
        if not give_up:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)
        mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
        return None

    report._parse_text(text)
    data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        mark_pq_status(
            pq,
            "Not a valid docket upload.",
            PROCESSING_STATUS.INVALID_CONTENT,
        )
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    d, docket_count = find_docket_object(
        pq.court_id, pq.pacer_case_id, data["docket_number"]
    )
    if docket_count > 1:
        logger.info(
            "Found %s dockets during lookup. Choosing oldest." % docket_count
        )
        d = d.earliest("date_created")

    d.add_recap_source()
    update_docket_metadata(d, data)
    d, og_info = update_docket_appellate_metadata(d, data)
    if not d.pacer_case_id:
        d.pacer_case_id = pq.pacer_case_id

    if pq.debug:
        # Debug uploads end here, before anything is saved.
        mark_pq_successful(pq, d_id=d.pk)
        self.request.chain = None
        return {"docket_pk": d.pk, "content_updated": False}

    if og_info is not None:
        # Save the originating court info first so the docket can point at it.
        og_info.save()
        d.originating_court_information = og_info
    d.save()

    # Add the HTML to the docket in case we need it someday.
    html_file = PacerHtmlFiles(
        content_object=d, upload_type=UPLOAD_TYPE.APPELLATE_DOCKET
    )
    html_file.filepath.save(
        "docket.html",  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    rds_created, content_updated = add_docket_entries(
        d, data["docket_entries"]
    )
    add_parties_and_attorneys(d, data["parties"])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    # Alerts are considered only when the lookup matched at least one docket,
    # and sent only when this call is the one that enqueued them.
    if content_updated and docket_count > 0:
        if enqueue_docket_alert(d.pk):
            send_docket_alert(d.pk, started_at)
    mark_pq_successful(pq, d_id=d.pk)

    result = {
        "docket_pk": d.pk,
        "content_updated": bool(rds_created or content_updated),
    }
    return result
Пример #19
0
def process_recap_docket(pk):
    """Process an uploaded docket from the RECAP API endpoint.

    The uploaded HTML is parsed, the matching docket is looked up (or a new
    one is instantiated), and the parsed metadata, docket entries, RECAP
    documents, and parties are merged into it.

    :param pk: The primary key of the processing queue item you want to work on.
    :return: The docket that's created or updated. None is returned when the
    upload does not parse as a docket or when a lookup matches more than one
    docket. When the queue item is in debug mode, the docket is returned
    before this function saves it or touches its entries.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if docket_data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, pq.INVALID_CONTENT)
        return None

    # Merge the contents of the docket into CL. Attempt several lookups of
    # decreasing specificity.
    lookups = [
        {
            'pacer_case_id': pq.pacer_case_id,
            'docket_number': docket_data['docket_number'],
        },
        {'pacer_case_id': pq.pacer_case_id},
        {'docket_number': docket_data['docket_number']},
    ]
    d = None
    for kwargs in lookups:
        try:
            d = Docket.objects.get(court_id=pq.court_id, **kwargs)
            break
        except Docket.DoesNotExist:
            continue
        except Docket.MultipleObjectsReturned:
            # An ambiguous match is treated as a failure rather than guessing
            # which docket the upload belongs to.
            msg = "Too many dockets found when trying to look up '%s'" % pq
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None

    if d is None:
        # Couldn't find it. Make a new one.
        d = Docket(source=Docket.RECAP,
                   pacer_case_id=pq.pacer_case_id,
                   court_id=pq.court_id)

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER

    update_docket_metadata(d, docket_data)

    if pq.debug:
        # Debug uploads stop here: nothing below this point is executed, so
        # the docket is not saved by this function.
        mark_pq_successful(pq, d_id=d.pk)
        return d

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                })
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s'" %
                (docket_entry['document_number'], pq))
            continue

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            try:
                RECAPDocument.objects.create(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                    pacer_doc_id=docket_entry['pacer_doc_id'],
                    is_available=False,
                )
            except IntegrityError:
                logger.warning(
                    "Creating new document with pacer_doc_id of '%s' violates "
                    "unique constraint on pacer_doc_id field." %
                    docket_entry['pacer_doc_id'])
                continue
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number'%s' "
                "while processing '%s'" %
                (docket_entry['document_number'], pq))
            continue
        else:
            # The document already exists. Only fill in a blank pacer_doc_id,
            # and take the value from this docket entry (the same value the
            # create path uses) rather than from the queue item, which holds a
            # single value for the whole upload. Persist the change;
            # assigning the attribute alone would be lost.
            if not rd.pacer_doc_id and docket_entry['pacer_doc_id']:
                rd.pacer_doc_id = docket_entry['pacer_doc_id']
                try:
                    rd.save()
                except IntegrityError:
                    logger.warning(
                        "Updating document with pacer_doc_id of '%s' violates "
                        "unique constraint on pacer_doc_id field." %
                        docket_entry['pacer_doc_id'])

    add_parties_and_attorneys(d, docket_data['parties'])
    mark_pq_successful(pq, d_id=d.pk)
    return d