Example #1
def get_and_process_pdf(self, data, session, row_pk, index=False):
    if data is None:
        return
    result = data['result']
    rd = RECAPDocument.objects.get(pk=data['rd_pk'])
    report = FreeOpinionReport(data['pacer_court_id'], session)
    try:
        r = report.download_pdf(result.pacer_case_id, result.pacer_doc_id)
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" % result)
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [HTTP_500_INTERNAL_SERVER_ERROR,
                                        HTTP_504_GATEWAY_TIMEOUT]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc=exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            PACERFreeDocumentRow.objects.filter(pk=row_pk).update(error_msg=msg)
            self.request.callbacks = None
            return

    if r is None:
        msg = "Unable to get PDF for %s at %s with doc id %s" % \
              (result, result.court_id, result.pacer_doc_id)
        logger.error(msg)
        PACERFreeDocumentRow.objects.filter(pk=row_pk).update(error_msg=msg)
        self.request.callbacks = None
        return

    file_name = get_document_filename(
        result.court.pk,
        result.pacer_case_id,
        result.document_number,
        0,  # Attachment number is zero for all free opinions.
    )
    cf = ContentFile(r.content)
    rd.filepath_local.save(file_name, cf, save=False)
    rd.is_available = True  # We've got the PDF.

    # The response content is sometimes a str and sometimes unicode, so
    # force it to bytes to keep hashlib happy.
    rd.sha1 = hashlib.sha1(force_bytes(r.content)).hexdigest()
    rd.is_free_on_pacer = True
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save and extract, skipping OCR.
    rd.save(do_extraction=False, index=index)
    extract_recap_pdf(rd.pk, skip_ocr=True, check_if_needed=False)
    return {'result': result, 'rd_pk': rd.pk}
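
The dict returned at the end is shaped for chaining: a downstream step can pick up the saved RECAPDocument by pk. Below is a minimal wiring sketch, assuming both functions are registered as bound Celery tasks on the same app; the upstream task name `fetch_free_document_row` is a hypothetical placeholder for whatever step produces the {'result': ..., 'rd_pk': ...} payload this task expects as `data`.

from celery import chain

# Hypothetical chain: the upstream step's return value is prepended to the
# next signature's arguments, so get_and_process_pdf receives it as `data`.
chain(
    fetch_free_document_row.si(row_pk),        # hypothetical upstream task
    get_and_process_pdf.s(session, row_pk),    # called as (data, session, row_pk)
).apply_async()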
Example #2
def get_and_save_free_document_report(self, court_id, start, end, session):
    """Download the Free document report and save it to the DB.
    
    :param self: The Celery task.
    :param court_id: A PACER court id.
    :param start: A date object representing the first day to get results.
    :param end: A date object representing the last day to get results.
    :param session: A PACER session object.
    :return: A PACERFreeDocumentLog status constant: SCRAPE_SUCCESSFUL on
    success, or SCRAPE_FAILED once the retry limit is exhausted.
    """
    report = FreeOpinionReport(court_id, session)
    try:
        responses = report.query(start, end, sort='case_number')
    except (ConnectionError, ChunkedEncodingError, ReadTimeoutError,
            ReadTimeout, ConnectTimeout) as exc:
        logger.warning("Unable to get free document report results from %s "
                       "(%s to %s). Trying again." % (court_id, start, end))
        if self.request.retries == self.max_retries:
            return PACERFreeDocumentLog.SCRAPE_FAILED
        raise self.retry(exc=exc, countdown=10)

    try:
        results = report.parse(responses)
    except (IndexError, HTTPError) as exc:
        # IndexError: the page wasn't downloaded properly.
        # HTTPError: raise_for_status() in parse hit a bad status code.
        if self.request.retries == self.max_retries:
            return PACERFreeDocumentLog.SCRAPE_FAILED
        raise self.retry(exc=exc, countdown=10)

    for row in results:
        try:
            PACERFreeDocumentRow.objects.create(
                court_id=row.court_id,
                pacer_case_id=row.pacer_case_id,
                docket_number=row.docket_number,
                case_name=row.case_name,
                date_filed=row.date_filed,
                pacer_doc_id=row.pacer_doc_id,
                document_number=row.document_number,
                description=row.description,
                nature_of_suit=row.nature_of_suit,
                cause=row.cause,
            )
        except IntegrityError:
            # Duplicate for whatever reason.
            continue

    return PACERFreeDocumentLog.SCRAPE_SUCCESSFUL
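
The `self.retry(...)` calls and the `self.max_retries` check above presuppose that the function is registered as a bound Celery task. A rough sketch of the kind of registration assumed follows; the app name and retry budget are guesses for illustration, not taken from the source.

from celery import Celery

app = Celery('scrapers')  # hypothetical app name

@app.task(bind=True, max_retries=5)
def get_and_save_free_document_report(self, court_id, start, end, session):
    ...  # body as shown in Example #2 above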
Example #3
def get_free_document_report(self, court_id, start, end, session):
    """Get structured results from the PACER free document report"""
    report = FreeOpinionReport(court_id, session)
    try:
        responses = report.query(start, end, sort='case_number')
    except (ConnectionError, ChunkedEncodingError, ReadTimeoutError,
            ConnectTimeout) as exc:
        logger.warning("Unable to get free document report results from %s "
                       "(%s to %s). Trying again." % (court_id, start, end))
        raise self.retry(exc=exc, countdown=5)

    try:
        return report.parse(responses)
    except IndexError as exc:
        # Happens when the page isn't downloaded properly, ugh.
        raise self.retry(exc=exc, countdown=15)
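
A minimal call-site sketch, assuming the task above is registered as a bound Celery task and that `pacer_session` is an already-authenticated PacerSession (and that the configured task serializer can carry it); the court id and dates are placeholders.

from datetime import date

# Hypothetical invocation: fetch and parse one court's free-opinion rows
# for a single day, blocking until the worker returns the result.
rows = get_free_document_report.delay(
    'cand', date(2017, 1, 1), date(2017, 1, 1), pacer_session,
).get()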
Example #4
    @classmethod
    def setUpClass(cls):
        pacer_session = PacerSession()

        if PACER_USERNAME and PACER_PASSWORD:
            # CAND chosen at random
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)

        with open(os.path.join(JURISCRAPER_ROOT, 'pacer/courts.json')) as j:
            cls.courts = get_courts_from_json(json.load(j))

        path = os.path.join(TESTS_ROOT,
                            'fixtures/valid_free_opinion_dates.json')
        with open(path) as j:
            cls.valid_dates = json.load(j)

        cls.reports = {}
        for court in cls.courts:
            court_id = get_court_id_from_url(court['court_link'])
            cls.reports[court_id] = FreeOpinionReport(court_id, pacer_session)

    def setUp(self):
        pacer_session = PacerSession()

        if pacer_credentials_are_defined():
            # CAND chosen at random
            pacer_session = get_pacer_session()
            pacer_session.login()

        with open(os.path.join(JURISCRAPER_ROOT, "pacer/courts.json")) as j:
            self.courts = get_courts_from_json(json.load(j))

        path = os.path.join(TESTS_ROOT_EXAMPLES_PACER,
                            "dates/valid_free_opinion_dates.json")
        with open(path) as j:
            self.valid_dates = json.load(j)

        self.reports = {}
        for court in self.courts:
            court_id = get_court_id_from_url(court["court_link"])
            self.reports[court_id] = FreeOpinionReport(court_id, pacer_session)
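
A hypothetical test method built on the fixtures prepared in setUp; it assumes valid_dates maps a court id to a 'YYYY-MM-DD' string with known results, which is an inference from the fixture name rather than something stated above.

    def test_queries_free_opinion_report(self):
        # Hypothetical test: run the report for each court on a date that
        # is expected to have results, and make sure parsing succeeds.
        from datetime import datetime
        for court_id, report in self.reports.items():
            date_str = self.valid_dates.get(court_id)
            if date_str is None:
                continue
            d = datetime.strptime(date_str, '%Y-%m-%d').date()
            responses = report.query(d, d, sort='case_number')
            self.assertIsNotNone(report.parse(responses))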