Пример #1
0
def fetch_pacer_doc_by_rd(self, rd_pk, fq_pk):
    """Fetch a PACER PDF by rd_pk

    This is very similar to get_pacer_doc_by_rd, except that it manages
    status as it proceeds and it gets the cookie info from redis.

    :param rd_pk: The PK of the RECAP Document to get.
    :param fq_pk: The PK of the RECAP Fetch Queue to update.
    :return: The RECAPDocument PK, or None if the fetch was aborted.
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    fq = PacerFetchQueue.objects.get(pk=fq_pk)
    mark_fq_status(fq, "", PROCESSING_STATUS.IN_PROGRESS)

    if rd.is_available:
        # Nothing to do; report success and stop the rest of the chain.
        msg = "PDF already marked as 'is_available'. Doing nothing."
        mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL)
        self.request.chain = None
        return

    if not rd.pacer_doc_id:
        # Without a pacer_doc_id we cannot identify the document on PACER,
        # so the download below could never succeed. Abort early.
        msg = (
            "Missing 'pacer_doc_id' attribute. Without this attribute we "
            "cannot identify the document properly. Missing pacer_doc_id "
            "attributes usually indicate that the item may not have a "
            "document associated with it, or it may need to be updated via "
            "the docket report to acquire a pacer_doc_id. Aborting request."
        )
        mark_fq_status(fq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return

    cookies = get_pacer_cookie_from_cache(fq.user_id)
    if not cookies:
        msg = "Unable to find cached cookies. Aborting request."
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return

    pacer_case_id = rd.docket_entry.docket.pacer_case_id
    try:
        r = download_pacer_pdf_by_rd(
            rd.pk, pacer_case_id, rd.pacer_doc_id, cookies
        )
    except (requests.RequestException, HTTPError):
        msg = "Failed to get PDF from network."
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return

    court_id = rd.docket_entry.docket.court_id
    success, msg = update_rd_metadata(
        self,
        rd_pk,
        r,
        court_id,
        pacer_case_id,
        rd.pacer_doc_id,
        rd.document_number,
        rd.attachment_number,
    )

    if success is False:
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return

    return rd.pk
Пример #2
0
def fetch_attachment_page(self, fq_pk):
    """Fetch a PACER attachment page by rd_pk

    This is very similar to process_recap_attachment, except that it manages
    status as it proceeds and it gets the cookie info from redis.

    :param fq_pk: The PK of the RECAP Fetch Queue to update.
    :return: None
    """
    fq = PacerFetchQueue.objects.get(pk=fq_pk)
    mark_fq_status(fq, "", PROCESSING_STATUS.IN_PROGRESS)

    rd = fq.recap_document
    if not rd.pacer_doc_id:
        # The attachment page request cannot be built without a pacer_doc_id.
        mark_fq_status(
            fq,
            "Unable to get attachment page: Unknown pacer_doc_id for "
            "RECAP Document object %s" % rd.pk,
            PROCESSING_STATUS.NEEDS_INFO,
        )
        return

    cookies = get_pacer_cookie_from_cache(fq.user_id)
    if not cookies:
        mark_fq_status(
            fq,
            "Unable to find cached cookies. Aborting request.",
            PROCESSING_STATUS.FAILED,
        )
        return

    try:
        result = get_attachment_page_by_rd(rd.pk, cookies)
    except (requests.RequestException, HTTPError):
        mark_fq_status(
            fq,
            "Failed to get attachment page from network.",
            PROCESSING_STATUS.FAILED,
        )
        return

    page_text = result.response.text
    court_id = rd.docket_entry.docket.court_id
    att_data = get_data_from_att_report(page_text, court_id)

    if att_data == {}:
        # The parser found nothing usable on the page.
        mark_fq_status(
            fq,
            "Not a valid attachment page upload",
            PROCESSING_STATUS.INVALID_CONTENT,
        )
        return

    try:
        merge_attachment_page_data(
            rd.docket_entry.docket.court,
            rd.docket_entry.docket.pacer_case_id,
            att_data["pacer_doc_id"],
            att_data["document_number"],
            page_text,
            att_data["attachments"],
        )
    except RECAPDocument.MultipleObjectsReturned:
        mark_fq_status(
            fq,
            "Too many documents found when attempting to associate "
            "attachment data",
            PROCESSING_STATUS.FAILED,
        )
        return
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if self.request.retries == self.max_retries:
            # Out of retries; record the final failure.
            mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
            return
        mark_fq_status(fq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
        raise self.retry(exc=exc)

    mark_fq_status(
        fq,
        "Successfully completed fetch and save.",
        PROCESSING_STATUS.SUCCESSFUL,
    )
Пример #3
0
def fetch_docket(self, fq_pk):
    """Fetch a docket from PACER

    This mirrors code elsewhere that gets dockets, but manages status as it
    goes through the process.

    :param fq_pk: The PK of the RECAP Fetch Queue to update.
    :return: None on failure/abort, else the docket fetch result.
    """
    fq = PacerFetchQueue.objects.get(pk=fq_pk)
    # This is a fetch queue, so use mark_fq_status (was mark_pq_status).
    mark_fq_status(fq, "", PROCESSING_STATUS.IN_PROGRESS)

    cookies = get_pacer_cookie_from_cache(fq.user_id)
    if cookies is None:
        msg = (
            "Cookie cache expired before task could run for user: %s"
            % fq.user_id
        )
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        # Without cookies there is no authenticated PACER session; abort
        # instead of continuing with PacerSession(cookies=None).
        self.request.chain = None
        return None

    court_id = fq.court_id or getattr(fq.docket, "court_id", None)
    s = PacerSession(cookies=cookies)

    try:
        result = fetch_pacer_case_id_and_title(s, fq, court_id)
    except (requests.RequestException, ReadTimeoutError) as exc:
        # Interpolate fq_pk; the bare "%s" was never being filled in.
        msg = "Network error getting pacer_case_id for fq: %s." % fq_pk
        if self.request.retries == self.max_retries:
            mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
            self.request.chain = None
            return None
        mark_fq_status(
            fq, msg + " Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY
        )
        raise self.retry(exc=exc)
    except PacerLoginException as exc:
        msg = (
            "PacerLoginException while getting pacer_case_id for fq: %s."
            % fq_pk
        )
        if self.request.retries == self.max_retries:
            mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
            self.request.chain = None
            return None
        mark_fq_status(
            fq, msg + " Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY
        )
        raise self.retry(exc=exc)
    except ParsingException:
        msg = "Unable to parse pacer_case_id for docket."
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return None

    # result can be one of three values:
    #   None       --> Sealed or missing case
    #   Empty dict --> Didn't run the pacer_case_id lookup (wasn't needed)
    #   Full dict  --> Ran the query, got back results
    if result is None:
        msg = "Cannot find case by docket number (perhaps it's sealed?)"
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return None

    pacer_case_id = getattr(fq.docket, "pacer_case_id", None) or result.get(
        "pacer_case_id"
    )

    if not pacer_case_id:
        msg = "Unable to determine pacer_case_id for docket."
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return None

    try:
        result = fetch_docket_by_pacer_case_id(s, court_id, pacer_case_id, fq)
    except (requests.RequestException, ReadTimeoutError) as exc:
        # Message corrected: this step fetches the docket itself, not the
        # pacer_case_id (the old text was copy-pasted from the step above).
        msg = "Network error getting docket for fq: %s." % fq_pk
        if self.request.retries == self.max_retries:
            mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
            self.request.chain = None
            return None
        mark_fq_status(
            fq, msg + " Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY
        )
        raise self.retry(exc=exc)
    except ParsingException:
        msg = "Unable to parse docket."
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return None

    msg = "Successfully got and merged docket. Adding to Solr as final step."
    mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL)
    return result
Пример #4
0
def fetch_pacer_doc_by_rd(self, rd_pk: int, fq_pk: int) -> Optional[int]:
    """Fetch a PACER PDF by rd_pk

    This is very similar to get_pacer_doc_by_rd, except that it manages
    status as it proceeds and it gets the cookie info from redis.

    :param rd_pk: The PK of the RECAP Document to get.
    :param fq_pk: The PK of the RECAP Fetch Queue to update.
    :return: The RECAPDocument PK
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    fq = PacerFetchQueue.objects.get(pk=fq_pk)
    mark_fq_status(fq, "", PROCESSING_STATUS.IN_PROGRESS)

    def abort(message, status):
        # Record the terminal status and break the rest of the Celery chain.
        mark_fq_status(fq, message, status)
        self.request.chain = None

    if rd.is_available:
        abort(
            "PDF already marked as 'is_available'. Doing nothing.",
            PROCESSING_STATUS.SUCCESSFUL,
        )
        return

    if not rd.pacer_doc_id:
        abort(
            "Missing 'pacer_doc_id' attribute. Without this attribute we "
            "cannot identify the document properly. Missing pacer_doc_id "
            "attributes usually indicate that the item may not have a "
            "document associated with it, or it may need to be updated via "
            "the docket report to acquire a pacer_doc_id. Aborting request.",
            PROCESSING_STATUS.INVALID_CONTENT,
        )
        return

    cookies = get_pacer_cookie_from_cache(fq.user_id)
    if not cookies:
        abort(
            "Unable to find cached cookies. Aborting request.",
            PROCESSING_STATUS.FAILED,
        )
        return

    docket = rd.docket_entry.docket
    try:
        response = download_pacer_pdf_by_rd(
            rd.pk, docket.pacer_case_id, rd.pacer_doc_id, cookies
        )
    except (requests.RequestException, HTTPError):
        abort("Failed to get PDF from network.", PROCESSING_STATUS.FAILED)
        return

    success, msg = update_rd_metadata(
        self,
        rd_pk,
        response,
        docket.court_id,
        docket.pacer_case_id,
        rd.pacer_doc_id,
        rd.document_number,
        rd.attachment_number,
    )

    if success is False:
        abort(msg, PROCESSING_STATUS.FAILED)
        return

    return rd.pk