示例#1
0
def fetch_docket_by_pacer_case_id(
    session, court_id, pacer_case_id, fq,
):
    """Pull a docket report from PACER and fold it into the CL database.

    :param session: A PacerSession object to work with
    :param court_id: The CL ID of the court
    :param pacer_case_id: The pacer_case_id of the docket, if known
    :param fq: The PacerFetchQueue object. When its docket_id is set, that
    docket is the one updated; it also supplies the extra query kwargs.
    :return: a dict holding the docket's PK ("docket_pk") and a boolean
    ("content_updated") that is True when the merge reported created RDs or
    updated content
    :raises ParsingException: if the queried report has no data
    """
    pacer_court_id = map_cl_to_pacer_id(court_id)
    report = DocketReport(pacer_court_id, session)
    query_kwargs = get_fq_docket_kwargs(fq)
    report.query(pacer_case_id, **query_kwargs)

    parsed = report.data
    if not parsed:
        raise ParsingException("No data found in docket report.")

    if not fq.docket_id:
        # No docket was pinned on the fetch queue item, so look one up.
        docket, match_count = find_docket_object(
            court_id, pacer_case_id, parsed["docket_number"]
        )
        if match_count > 1:
            # Several candidates came back; settle on the oldest one.
            docket = docket.earliest("date_created")
    else:
        docket = Docket.objects.get(pk=fq.docket_id)

    merge_result = merge_pacer_docket_into_cl_docket(
        docket, pacer_case_id, parsed, report, appellate=False,
    )
    rds_created, content_updated = merge_result
    return {
        "docket_pk": docket.pk,
        "content_updated": bool(rds_created or content_updated),
    }
示例#2
0
 def setUp(self):
     """Look up the test docket and run process_docket_data on DOCKET_PATH.

     Raises a plain Exception if the lookup reports more than one docket.
     """
     lookup = find_docket_object("akd", "41664", "3:11-cv-00064")
     self.docket, count = lookup
     if count > 1:
         message = "Should not get more than one docket during this test!"
         raise Exception(message)
     process_docket_data(
         self.docket, self.DOCKET_PATH, UPLOAD_TYPE.IA_XML_FILE
     )
示例#3
0
 def setUp(self) -> None:
     """Look up the test docket, point it at the XML file, and process it."""
     docket_number = "3:11-cv-00064"
     docket = self.docket = find_docket_object(
         "akd", "41664", docket_number
     )
     docket.filepath_local = "/test/xml/gov.uscourts.akd.41664.docket.xml"
     docket.docket_number = docket_number
     docket.save()
     process_docket_data(docket, UPLOAD_TYPE.IA_XML_FILE)
示例#4
0
def merge_rss_feed_contents(feed_data, court_pk, feed_status_pk):
    """Fold the items of a parsed RSS feed into CourtListener.

    :param feed_data: The data parameter of a PacerRssFeed object that has
    already queried the feed and been parsed.
    :param court_pk: The CourtListener court ID.
    :param feed_status_pk: The CL ID for the RSS status object.
    :returns: A dict containing keys:
      d_pks_to_alert: A list of (docket pk, start time) tuples for dockets
      whose alert was newly enqueued
      rds_for_solr: A list of the PKs of all RDs created during processing
    """
    started_at = now()
    status = RssFeedStatus.objects.get(pk=feed_status_pk)

    # Each feed item is a normal Juriscraper docket object.
    new_rd_pks = []
    alert_queue = []
    for item in feed_data:
        digest = hash_item(item)
        if is_cached(digest):
            continue

        with transaction.atomic():
            if not cache_hash(digest):
                # Already in the cache after all: another thread/process is
                # handling this item and we lost the race. Leave it to them.
                continue

            cl_docket, match_count = find_docket_object(
                court_pk, item["pacer_case_id"], item["docket_number"]
            )
            if match_count > 1:
                logger.info(
                    "Found %s dockets during lookup. Choosing oldest."
                    % match_count
                )
                cl_docket = cl_docket.earliest("date_created")

            cl_docket.add_recap_source()
            update_docket_metadata(cl_docket, item)
            if not cl_docket.pacer_case_id:
                cl_docket.pacer_case_id = item["pacer_case_id"]
            cl_docket.save()
            rds_created, content_updated = add_docket_entries(
                cl_docket, item["docket_entries"]
            )

        # Alerts only go out when the lookup reported an existing match.
        if content_updated and match_count > 0:
            if enqueue_docket_alert(cl_docket.pk):
                alert_queue.append((cl_docket.pk, started_at))

        new_rd_pks += [rd.pk for rd in rds_created]

    logger.info(
        "%s: Sending %s new RECAP documents to Solr for indexing and "
        "sending %s dockets for alerts.",
        status.court_id,
        len(new_rd_pks),
        len(alert_queue),
    )
    return {"d_pks_to_alert": alert_queue, "rds_for_solr": new_rd_pks}
示例#5
0
    def test_rss_feed_ingestion(self):
        """Can we ingest RSS feeds without creating duplicates?

        Parses a sample feed, merges its first docket, and checks that adding
        the same docket entries a second time leaves the entry count at one.
        """
        court_id = 'scotus'
        rss_feed = PacerRssFeed(court_id)
        rss_feed.is_bankruptcy = True  # Needed because we say SCOTUS above.
        # Open in binary mode so read() yields bytes and the explicit decode
        # works on both Python 2 and Python 3 (text-mode str has no decode).
        with open(self.make_path('rss_sample_unnumbered_mdb.xml'), 'rb') as f:
            text = f.read().decode('utf-8')
        rss_feed._parse_text(text)
        docket = rss_feed.data[0]
        d, docket_count = find_docket_object(court_id, docket['pacer_case_id'],
                                             docket['docket_number'])
        update_docket_metadata(d, docket)
        d.save()
        # The lookup should not have reported any existing docket.
        self.assertEqual(docket_count, 0)

        expected_count = 1
        add_docket_entries(d, docket['docket_entries'])
        self.assertEqual(d.docket_entries.count(), expected_count)
        # Adding the same entries again must not create duplicates.
        add_docket_entries(d, docket['docket_entries'])
        self.assertEqual(d.docket_entries.count(), expected_count)
示例#6
0
def process_recap_appellate_docket(self, pk):
    """Merge an appellate docket uploaded via the RECAP API into CL.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: None when the upload cannot be read or is not a valid docket;
    otherwise a dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or
            // recap document was created (implying a Solr needs
            // updating). Always False for debug items.
            'content_updated': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.

    """
    began = now()
    queue_item = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(queue_item, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info(
        "Processing Appellate RECAP item (debug is: %s): %s"
        % (queue_item.debug, queue_item)
    )

    parser = AppellateDocketReport(map_cl_to_pacer_id(queue_item.court_id))

    try:
        html = queue_item.filepath_local.read().decode("utf-8")
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        out_of_retries = self.request.retries == self.max_retries
        if out_of_retries or queue_item.debug:
            mark_pq_status(queue_item, msg, PROCESSING_STATUS.FAILED)
            return None
        mark_pq_status(queue_item, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
        raise self.retry(exc=exc)

    parser._parse_text(html)
    parsed = parser.data
    logger.info("Parsing completed of item %s" % queue_item)

    if parsed == {}:
        # Juriscraper found nothing: the upload is not really a docket.
        mark_pq_status(
            queue_item,
            "Not a valid docket upload.",
            PROCESSING_STATUS.INVALID_CONTENT,
        )
        self.request.chain = None
        return None

    # Locate the docket to merge into.
    docket, match_count = find_docket_object(
        queue_item.court_id, queue_item.pacer_case_id, parsed["docket_number"]
    )
    if match_count > 1:
        logger.info(
            "Found %s dockets during lookup. Choosing oldest." % match_count
        )
        docket = docket.earliest("date_created")

    docket.add_recap_source()
    update_docket_metadata(docket, parsed)
    docket, og_info = update_docket_appellate_metadata(docket, parsed)
    if not docket.pacer_case_id:
        docket.pacer_case_id = queue_item.pacer_case_id

    if queue_item.debug:
        # Debug items stop here, before the docket is saved.
        mark_pq_successful(queue_item, d_id=docket.pk)
        self.request.chain = None
        return {"docket_pk": docket.pk, "content_updated": False}

    if og_info is not None:
        og_info.save()
        docket.originating_court_information = og_info
    docket.save()

    # Keep the raw HTML with the docket in case we need it someday. Only the
    # extension of the name matters w/UUIDFileSystemStorage.
    html_record = PacerHtmlFiles(
        content_object=docket, upload_type=UPLOAD_TYPE.APPELLATE_DOCKET
    )
    html_record.filepath.save("docket.html", ContentFile(html))

    rds_created, content_updated = add_docket_entries(
        docket, parsed["docket_entries"]
    )
    add_parties_and_attorneys(docket, parsed["parties"])
    process_orphan_documents(
        rds_created, queue_item.court_id, docket.date_filed
    )
    if (
        content_updated
        and match_count > 0
        and enqueue_docket_alert(docket.pk)
    ):
        send_docket_alert(docket.pk, began)
    mark_pq_successful(queue_item, d_id=docket.pk)
    return {
        "docket_pk": docket.pk,
        "content_updated": bool(rds_created or content_updated),
    }
示例#7
0
def process_recap_docket_history_report(self, pk):
    """Merge an uploaded docket history report into CourtListener.

    :param pk: The primary key of the processing queue item you want to work on
    :returns: None when the upload cannot be read, is not a valid docket
    history page, or the docket cannot be saved after the final retry;
    otherwise a dict with the docket's PK ("docket_pk") and a boolean
    ("content_updated") indicating whether the docket needs Solr re-indexing.
    For debug items the dict is returned before the docket is saved and
    content_updated is False.
    """
    began = now()
    queue_item = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(queue_item, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info(
        "Processing RECAP item (debug is: %s): %s"
        % (queue_item.debug, queue_item)
    )

    try:
        html = queue_item.filepath_local.read().decode("utf-8")
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        out_of_retries = self.request.retries == self.max_retries
        if out_of_retries or queue_item.debug:
            mark_pq_status(queue_item, msg, PROCESSING_STATUS.FAILED)
            return None
        mark_pq_status(queue_item, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
        raise self.retry(exc=exc)

    parser = DocketHistoryReport(map_cl_to_pacer_id(queue_item.court_id))
    parser._parse_text(html)
    parsed = parser.data
    logger.info("Parsing completed for item %s" % queue_item)

    if parsed == {}:
        # Bad docket history page.
        mark_pq_status(
            queue_item,
            "Not a valid docket history page upload.",
            PROCESSING_STATUS.INVALID_CONTENT,
        )
        self.request.chain = None
        return None

    # Locate the docket to merge into.
    docket, match_count = find_docket_object(
        queue_item.court_id, queue_item.pacer_case_id, parsed["docket_number"]
    )
    if match_count > 1:
        logger.info(
            "Found %s dockets during lookup. Choosing oldest." % match_count
        )
        docket = docket.earliest("date_created")

    docket.add_recap_source()
    update_docket_metadata(docket, parsed)

    if queue_item.debug:
        # Debug items stop here, before the docket is saved.
        mark_pq_successful(queue_item, d_id=docket.pk)
        self.request.chain = None
        return {"docket_pk": docket.pk, "content_updated": False}

    try:
        docket.save()
    except IntegrityError as exc:
        logger.warning(
            "Race condition experienced while attempting docket save."
        )
        error_message = "Unable to save docket due to IntegrityError."
        if self.request.retries == self.max_retries:
            # Out of retries: give up and stop the rest of the chain.
            mark_pq_status(queue_item, error_message, PROCESSING_STATUS.FAILED)
            self.request.chain = None
            return None
        mark_pq_status(
            queue_item, error_message, PROCESSING_STATUS.QUEUED_FOR_RETRY
        )
        raise self.retry(exc=exc)

    # Keep the raw HTML with the docket in case we need it someday. Only the
    # extension of the name matters w/UUIDFileSystemStorage.
    html_record = PacerHtmlFiles(
        content_object=docket, upload_type=UPLOAD_TYPE.DOCKET_HISTORY_REPORT
    )
    html_record.filepath.save("docket_history.html", ContentFile(html))

    rds_created, content_updated = add_docket_entries(
        docket, parsed["docket_entries"]
    )
    process_orphan_documents(
        rds_created, queue_item.court_id, docket.date_filed
    )
    if (
        content_updated
        and match_count > 0
        and enqueue_docket_alert(docket.pk)
    ):
        send_docket_alert(docket.pk, began)
    mark_pq_successful(queue_item, d_id=docket.pk)
    return {
        "docket_pk": docket.pk,
        "content_updated": bool(rds_created or content_updated),
    }
示例#8
0
def process_recap_claims_register(self, pk):
    """Merge bankruptcy claims registry HTML into RECAP

    :param pk: The primary key of the processing queue item you want to work on
    :type pk: int
    :return: None when the item is a debug item, the upload cannot be read or
    parsed, or the docket cannot be saved after the final retry; otherwise a
    dict with the PK of the docket under the key "docket_pk".
    :rtype: dict or None
    """
    queue_item = ProcessingQueue.objects.get(pk=pk)
    if queue_item.debug:
        # Proper debugging not supported on this endpoint. Just abort.
        mark_pq_successful(queue_item)
        self.request.chain = None
        return None

    mark_pq_status(queue_item, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info(
        "Processing RECAP item (debug is: %s): %s"
        % (queue_item.debug, queue_item)
    )

    try:
        html = queue_item.filepath_local.read().decode("utf-8")
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        out_of_retries = self.request.retries == self.max_retries
        if out_of_retries or queue_item.debug:
            mark_pq_status(queue_item, msg, PROCESSING_STATUS.FAILED)
            return None
        mark_pq_status(queue_item, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
        raise self.retry(exc=exc)

    parser = ClaimsRegister(map_cl_to_pacer_id(queue_item.court_id))
    parser._parse_text(html)
    parsed = parser.data
    logger.info("Parsing completed for item %s" % queue_item)

    if not parsed:
        # Bad HTML
        mark_pq_status(
            queue_item,
            "Not a valid claims registry page or other parsing failure",
            PROCESSING_STATUS.INVALID_CONTENT,
        )
        self.request.chain = None
        return None

    # Locate the docket to merge into.
    docket, match_count = find_docket_object(
        queue_item.court_id, queue_item.pacer_case_id, parsed["docket_number"]
    )
    if match_count > 1:
        logger.info(
            "Found %s dockets during lookup. Choosing oldest." % match_count
        )
        docket = docket.earliest("date_created")

    # Merge the contents into CL
    docket.add_recap_source()
    update_docket_metadata(docket, parsed)

    try:
        docket.save()
    except IntegrityError as exc:
        logger.warning(
            "Race condition experienced while attempting docket save."
        )
        error_message = "Unable to save docket due to IntegrityError."
        if self.request.retries == self.max_retries:
            # Out of retries: give up and stop the rest of the chain.
            mark_pq_status(queue_item, error_message, PROCESSING_STATUS.FAILED)
            self.request.chain = None
            return None
        mark_pq_status(
            queue_item, error_message, PROCESSING_STATUS.QUEUED_FOR_RETRY
        )
        raise self.retry(exc=exc)

    add_bankruptcy_data_to_docket(docket, parsed)
    add_claims_to_docket(docket, parsed["claims"])
    logger.info("Created/updated claims data for %s", queue_item)

    # Keep the raw HTML with the docket in case we need it someday. Only the
    # extension of the name matters w/UUIDFileSystemStorage.
    html_record = PacerHtmlFiles(
        content_object=docket, upload_type=UPLOAD_TYPE.CLAIMS_REGISTER
    )
    html_record.filepath.save("claims_registry.html", ContentFile(html))

    mark_pq_successful(queue_item, d_id=docket.pk)
    return {"docket_pk": docket.pk}
示例#9
0
def merge_rss_feed_contents(self, feed_data, court_pk, metadata_only=False):
    """Fold the items of a parsed RSS feed into CourtListener.

    :param self: The Celery task
    :param feed_data: The data parameter of a PacerRssFeed object that has
    already queried the feed and been parsed.
    :param court_pk: The CourtListener court ID.
    :param metadata_only: Whether to only do metadata and skip docket entries.
    :returns Dict containing keys:
      d_pks_to_alert: A list of (docket pk, start time) tuples for sending
      alerts
      rds_for_solr: A list of RECAPDocument PKs for updating in Solr
    """
    started_at = now()

    # Each feed item is a normal Juriscraper docket object.
    new_rd_pks = []
    alert_queue = []
    for item in feed_data:
        digest = hash_item(item)
        if is_cached(digest):
            continue

        with transaction.atomic():
            if not cache_hash(digest):
                # Already in the cache after all: another thread/process is
                # handling this item and we lost the race. Leave it to them.
                continue

            cl_docket, match_count = find_docket_object(
                court_pk, item["pacer_case_id"], item["docket_number"]
            )
            if match_count > 1:
                logger.info(
                    "Found %s dockets during lookup. Choosing oldest."
                    % match_count
                )
                cl_docket = cl_docket.earliest("date_created")

            cl_docket.add_recap_source()
            update_docket_metadata(cl_docket, item)
            if not cl_docket.pacer_case_id:
                cl_docket.pacer_case_id = item["pacer_case_id"]
            try:
                cl_docket.save()
                add_bankruptcy_data_to_docket(cl_docket, item)
            except IntegrityError as exc:
                # The docket was created while we looked it up. Retry and it
                # should associate with the new one instead.
                raise self.retry(exc=exc)

            if metadata_only:
                # Skip docket entries, alerts and RD collection entirely.
                continue

            rds_created, content_updated = add_docket_entries(
                cl_docket, item["docket_entries"]
            )

        # Alerts only go out when the lookup reported an existing match.
        if content_updated and match_count > 0:
            if enqueue_docket_alert(cl_docket.pk):
                alert_queue.append((cl_docket.pk, started_at))

        new_rd_pks += [rd.pk for rd in rds_created]

    logger.info(
        "%s: Sending %s new RECAP documents to Solr for indexing and "
        "sending %s dockets for alerts.",
        court_pk,
        len(new_rd_pks),
        len(alert_queue),
    )
    return {"d_pks_to_alert": alert_queue, "rds_for_solr": new_rd_pks}
示例#10
0
def process_recap_docket(self, pk):
    """Merge a docket uploaded via the RECAP API endpoint into CL.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: None when the upload cannot be read, is not a valid docket, or
    turns out to be a docket history report (which is handed to
    process_recap_docket_history_report); otherwise a dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or
            // recap document was created (implying a Solr needs
            // updating). Always False for debug items.
            'content_updated': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.

    """
    began = now()
    queue_item = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(queue_item, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info(
        f"Processing RECAP item (debug is: {queue_item.debug}): {queue_item}"
    )

    parser = DocketReport(map_cl_to_pacer_id(queue_item.court_id))

    try:
        html = queue_item.filepath_local.read().decode()
    except IOError as exc:
        msg = f"Internal processing error ({exc.errno}: {exc.strerror})."
        out_of_retries = self.request.retries == self.max_retries
        if out_of_retries or queue_item.debug:
            mark_pq_status(queue_item, msg, PROCESSING_STATUS.FAILED)
            return None
        mark_pq_status(queue_item, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
        raise self.retry(exc=exc)

    if "History/Documents" in html:
        # Prior to 1.1.8, we did not separate docket history reports into their
        # own upload_type. Alas, we still have some old clients around, so we
        # need to handle those clients here.
        queue_item.upload_type = UPLOAD_TYPE.DOCKET_HISTORY_REPORT
        queue_item.save()
        process_recap_docket_history_report(pk)
        self.request.chain = None
        return None

    parser._parse_text(html)
    parsed = parser.data
    logger.info(f"Parsing completed of item {queue_item}")

    if parsed == {}:
        # Juriscraper found nothing: the upload is not really a docket.
        mark_pq_status(
            queue_item,
            "Not a valid docket upload.",
            PROCESSING_STATUS.INVALID_CONTENT,
        )
        self.request.chain = None
        return None

    # Locate the docket to merge into.
    docket = find_docket_object(
        queue_item.court_id, queue_item.pacer_case_id, parsed["docket_number"]
    )

    docket.add_recap_source()
    update_docket_metadata(docket, parsed)
    if not docket.pacer_case_id:
        docket.pacer_case_id = queue_item.pacer_case_id

    if queue_item.debug:
        # Debug items stop here, before the docket is saved.
        mark_pq_successful(queue_item, d_id=docket.pk)
        self.request.chain = None
        return {"docket_pk": docket.pk, "content_updated": False}

    docket.save()

    # Keep the raw HTML with the docket in case we need it someday. Only the
    # extension of the name matters w/UUIDFileSystemStorage.
    html_record = PacerHtmlFiles(
        content_object=docket, upload_type=UPLOAD_TYPE.DOCKET
    )
    html_record.filepath.save("docket.html", ContentFile(html))

    rds_created, content_updated = add_docket_entries(
        docket, parsed["docket_entries"]
    )
    add_parties_and_attorneys(docket, parsed["parties"])
    process_orphan_documents(
        rds_created, queue_item.court_id, docket.date_filed
    )
    if content_updated and enqueue_docket_alert(docket.pk):
        send_docket_alert(docket.pk, began)
    mark_pq_successful(queue_item, d_id=docket.pk)
    return {
        "docket_pk": docket.pk,
        "content_updated": bool(rds_created or content_updated),
    }
示例#11
0
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: None when the upload cannot be read, is not a valid docket, or
    turns out to be a docket history report (which is handed to
    process_recap_docket_history_report); otherwise a dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or
            // recap document was created (implying a Solr needs
            // updating). Always False for debug items.
            'content_updated': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.

    """
    start_time = now()
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))

    # Reading the upload can fail with an IOError. Rather than letting the
    # item sit in IN_PROGRESS forever, record the failure and retry until the
    # retries run out (debug items are never retried).
    try:
        text = pq.filepath_local.read().decode('utf-8')
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    if 'History/Documents' in text:
        # Prior to 1.1.8, we did not separate docket history reports into their
        # own upload_type. Alas, we still have some old clients around, so we
        # need to handle those clients here.
        pq.upload_type = UPLOAD_TYPE.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.chain = None
        return None

    report._parse_text(text)
    data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    d, docket_count = find_docket_object(pq.court_id, pq.pacer_case_id,
                                         data['docket_number'])
    if docket_count > 1:
        logger.info("Found %s dockets during lookup. Choosing oldest." %
                    docket_count)
        d = d.earliest('date_created')

    d.add_recap_source()
    update_docket_metadata(d, data)
    if not d.pacer_case_id:
        d.pacer_case_id = pq.pacer_case_id

    if pq.debug:
        # Debug items stop here, before the docket is saved.
        mark_pq_successful(pq, d_id=d.pk)
        self.request.chain = None
        return {'docket_pk': d.pk, 'content_updated': False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d,
                                upload_type=UPLOAD_TYPE.DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    rds_created, content_updated = add_docket_entries(
        d, data['docket_entries'])
    add_parties_and_attorneys(d, data['parties'])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    # Alerts only go out when the lookup reported an existing match.
    if content_updated and docket_count > 0:
        newly_enqueued = enqueue_docket_alert(d.pk)
        if newly_enqueued:
            send_docket_alert(d.pk, start_time)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        'docket_pk': d.pk,
        'content_updated': bool(rds_created or content_updated),
    }
示例#12
0
 def setUp(self) -> None:
     """Look up the test docket and run process_docket_data on DOCKET_PATH."""
     court_id, pacer_case_id = "akd", "41664"
     docket_number = "3:11-cv-00064"
     self.docket = find_docket_object(court_id, pacer_case_id, docket_number)
     process_docket_data(
         self.docket, self.DOCKET_PATH, UPLOAD_TYPE.IA_XML_FILE
     )