Example #1
def get_pdfs(options):
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each of
    which represents a PDF we need to download and merge into our normal
    tables: Docket, DocketEntry, and RECAPDocument.

    In this function, we iterate over the entire table of results, merge it
    into our normal tables, and then download and extract the PDF.

    :return: None
    """
    q = options['queue']
    index = options['index']
    cnt = CaseNameTweaker()
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only('pk')
    count = rows.count()
    task_name = "downloading"
    if index:
        task_name += " and indexing"
    logger.info("%s %s items from PACER." % (task_name, count))
    throttle = CeleryThrottle(queue_name=q)
    completed = 0
    for row in queryset_generator(rows):
        throttle.maybe_wait()
        if completed % 30000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
        chain(
            process_free_opinion_result.si(row.pk, cnt).set(queue=q),
            get_and_process_pdf.s(pacer_session, row.pk,
                                  index=index).set(queue=q),
            delete_pacer_row.si(row.pk).set(queue=q),
        ).apply_async()
        completed += 1
        if completed % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery for %s so "
                        "far." % (completed, count, task_name))
Example #2
def get_pacer_dockets(options, row_pks, tag=None):
    """Get the pacer dockets identified by the FJC IDB rows"""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    for i, row_pk in enumerate(row_pks):
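        # Chained comparison: stop once `i` reaches options['count'], but
        # only when a positive count was requested.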
        if i >= options['count'] > 0:
            break
        throttle.maybe_wait()
        if i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            logger.info("Sent %s tasks to celery so far." % i)
        row = FjcIntegratedDatabase.objects.get(pk=row_pk)
        chain(
            get_docket_by_pacer_case_id.s(
                row.pacer_case_id, row.district_id, pacer_session,
                tag=tag,
                show_parties_and_counsel=True,
                show_terminated_parties=True,
                show_list_of_member_cases=True,
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
Example #3
def open_session(username, password):
    pacer_sess = PacerSession(username=username, password=password)
    return pacer_sess
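
A hypothetical usage of this factory (credentials come from settings, as in the other examples):

session = open_session(PACER_USERNAME, PACER_PASSWORD)
session.login()  # authenticate before issuing any PACER requests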
Example #4
 def setUpClass(cls):
     pacer_session = PacerSession(username='******', password='******')
     cls.report = DocketReport('psc', pacer_session)
     cls.pacer_case_id = '62866'  # 1:07-cr-00001-RJA-HKS USA v. Green
Example #5
 def setUp(self):
     self.session = PacerSession(username=PACER_USERNAME,
                                 password=PACER_PASSWORD)
Example #6
 def setUpClass(cls):
     if PACER_USERNAME and PACER_PASSWORD:
         cls.pacer_session = PacerSession(username=PACER_USERNAME,
                                          password=PACER_PASSWORD)
         cls.report = ShowCaseDocApi('dcd', cls.pacer_session)
Example #7
 def test_logging_in_bad_credentials(self):
     session = PacerSession(username='******', password='******')
     with self.assertRaises(PacerLoginException):
         session.login()
Example #8
 def setUp(self):
     self.session = PacerSession()
Example #9
def get_and_save_free_document_reports(options):
    """Query the Free Doc Reports on PACER and get a list of all the free
    documents. Do not download those items, as that step is done later. For now
    just get the list.

    Note that this uses synchronous celery chains. A previous version was more
    complex and did not use synchronous chains. Unfortunately in Celery 4.2.0,
    or more accurately in redis-py 3.x.x, doing it that way failed nearly every
    time.

    This version is simpler, though slower, but it should get the job done.
    """
    # Kill any *old* logs that report they're in progress. (They've failed.)
    three_hrs_ago = now() - timedelta(hours=3)
    PACERFreeDocumentLog.objects.filter(
        date_started__lt=three_hrs_ago,
        status=PACERFreeDocumentLog.SCRAPE_IN_PROGRESS,
    ).update(status=PACERFreeDocumentLog.SCRAPE_FAILED)

    cl_court_ids = Court.objects.filter(
        jurisdiction__in=[Court.FEDERAL_DISTRICT, Court.FEDERAL_BANKRUPTCY],
        in_use=True,
        end_date=None,
    ).exclude(
        pk__in=['casb', 'gub', 'innb', 'miwb', 'ohsb', 'prb'],
    ).values_list('pk', flat=True)
    pacer_court_ids = [map_cl_to_pacer_id(v) for v in cl_court_ids]

    pacer_session = PacerSession(username=PACER_USERNAME,
                                 password=PACER_PASSWORD)
    pacer_session.login()

    today = now()
    for pacer_court_id in pacer_court_ids:
        while True:
            next_start_d, next_end_d = get_next_date_range(pacer_court_id)
            logger.info(
                "Attempting to get latest document references for "
                "%s between %s and %s", pacer_court_id, next_start_d,
                next_end_d)
            mark_court_in_progress(pacer_court_id, next_end_d)
            try:
                status = get_and_save_free_document_report(
                    pacer_court_id, next_start_d, next_end_d,
                    pacer_session.cookies)
            except RequestException:
                logger.error(
                    "Failed to get document references for %s "
                    "between %s and %s due to network error.", pacer_court_id,
                    next_start_d, next_end_d)
                mark_court_done_on_date(PACERFreeDocumentLog.SCRAPE_FAILED,
                                        pacer_court_id, next_end_d)
                break
            except IndexError:
                logger.error(
                    "Failed to get document references for %s "
                    "between %s and %s due to PACER 6.3 bug.", pacer_court_id,
                    next_start_d, next_end_d)
                mark_court_done_on_date(PACERFreeDocumentLog.SCRAPE_FAILED,
                                        pacer_court_id, next_end_d)
                break
            else:
                result = mark_court_done_on_date(status, pacer_court_id,
                                                 next_end_d)

            if result == PACERFreeDocumentLog.SCRAPE_SUCCESSFUL:
                if next_end_d >= today.date():
                    logger.info("Got all document references for '%s'.",
                                pacer_court_id)
                    # Break from while loop, onwards to next court
                    break
                else:
                    # More dates to do; let it continue
                    continue

            elif result == PACERFreeDocumentLog.SCRAPE_FAILED:
                logger.error("Encountered critical error on %s "
                             "(network error?). Marking as failed and "
                             "pressing on." % pacer_court_id)
                # Break from while loop, onwards to next court
                break
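
The inner while loop walks each court forward in time, one date window per iteration, until the window reaches today. A self-contained sketch of that pattern; next_window is a hypothetical stand-in for get_next_date_range:

from datetime import date, timedelta


def next_window(last_end, span_days=7):
    # Hypothetical stand-in for get_next_date_range(): resume the day
    # after the previous scrape ended and advance one fixed-size window.
    start = last_end + timedelta(days=1)
    return start, min(start + timedelta(days=span_days - 1), date.today())


last_end = date(2018, 1, 1)
while last_end < date.today():
    start, end = next_window(last_end)
    # ... scrape [start, end] for one court, then persist `end` ...
    last_end = end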
Example #10
from juriscraper.pacer.http import PacerSession
from juriscraper.pacer.hidden_api import PossibleCaseNumberApi

pacer_sess = PacerSession(username="", password="")

court_id = "dcd"
docket_id = "9000013"
office_id = "1"
docket_param = "cv"

#court_id = "nmi"
#docket_id = "8800001"

api = PossibleCaseNumberApi(court_id=court_id, pacer_session=pacer_sess)

resp = api.query(docket_id)

print(resp)
print(api.data(office_number=office_id, docket_number_letters=docket_param))

#resp = api.query("7801294")
#print(resp)
Example #11
 def setUp(self):
     pacer_session = PacerSession(username=PACER_USERNAME,
                                  password=PACER_PASSWORD)
     pacer_session.login()
     self.report = DocketReport('cand', pacer_session)
     self.pacer_case_id = '186730'  # 4:06-cv-07294 Foley v. Bates
Example #12
 def setUpClass(cls):
     pacer_session = PacerSession(username=PACER_USERNAME,
                                  password=PACER_PASSWORD)
     cls.report = DocketReport('cand', pacer_session)
     cls.pacer_case_id = '186730'  # 4:06-cv-07294 Foley v. Bates
Example #13
#!/usr/bin/env python
#
#  Takes an .html file on the command line, parses it using the PACER
#  Docket Report parser, and outputs JSON to stdout.

import jsondate3 as json
import sys

from juriscraper.pacer.http import PacerSession
from juriscraper.pacer import DocketReport

pacer_session = PacerSession(username="******", password="******")
report = DocketReport("psc", pacer_session)

for path in sys.argv[1:]:
    with open(path, "r", encoding="utf-8") as f:
        report._parse_text(f.read())
    data = report.data
    print(json.dumps(data, indent=2, sort_keys=True, separators=(",", ": ")))
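
To run it, pass one or more saved docket HTML pages on the command line, e.g. python parse_docket.py docket.html > docket.json (the script name is a placeholder for wherever this file is saved).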
Example #14
def get_and_save_free_document_reports(options):
    """Query the Free Doc Reports on PACER and get a list of all the free
    documents. Do not download those items, as that step is done later.
    """
    # Kill any *old* logs that report they're in progress. (They've failed.)
    twelve_hrs_ago = now() - timedelta(hours=12)
    PACERFreeDocumentLog.objects.filter(
        date_started__lt=twelve_hrs_ago,
        status=PACERFreeDocumentLog.SCRAPE_IN_PROGRESS,
    ).update(status=PACERFreeDocumentLog.SCRAPE_FAILED)

    cl_court_ids = Court.objects.filter(
        jurisdiction__in=[Court.FEDERAL_DISTRICT, Court.FEDERAL_BANKRUPTCY],
        in_use=True,
        end_date=None,
    ).exclude(
        pk__in=['casb', 'ganb', 'gub', 'innb', 'mieb', 'miwb', 'nmib', 'nvb',
                'ohsb', 'prb', 'tnwb', 'vib'],
    ).values_list('pk', flat=True)
    pacer_court_ids = {
        map_cl_to_pacer_id(v): {
            'until': now(),
            'count': 1,
            'result': None
        }
        for v in cl_court_ids
    }
    pacer_session = PacerSession(username=PACER_USERNAME,
                                 password=PACER_PASSWORD)
    pacer_session.login()

    # Iterate over every court, X days at a time. As courts are completed,
    # remove them from the list of courts to process until none are left.
    tomorrow = now() + timedelta(days=1)
    while len(pacer_court_ids) > 0:
        court_ids_copy = pacer_court_ids.copy()  # Copy so courts can be popped mid-loop.
        for pacer_court_id, delay in court_ids_copy.items():
            if now() < delay['until']:
                # Do other courts until the delay is up. Do not print/log
                # anything since at the end there will only be one court left.
                continue

            next_start_date, next_end_date = get_next_date_range(
                pacer_court_id)
            if delay['result'] is not None:
                if delay['result'].ready():
                    result = delay['result'].get()
                    if result == PACERFreeDocumentLog.SCRAPE_SUCCESSFUL:
                        if next_start_date >= tomorrow.date():
                            logger.info("Finished '%s'. Marking it complete." %
                                        pacer_court_id)
                            pacer_court_ids.pop(pacer_court_id, None)
                            continue

                    elif result == PACERFreeDocumentLog.SCRAPE_FAILED:
                        logger.error("Encountered critical error on %s "
                                     "(network error?). Marking as failed and "
                                     "pressing on." % pacer_court_id)
                        pacer_court_ids.pop(pacer_court_id, None)
                        continue
                else:
                    next_delay = min(delay['count'] * 5, 30)  # backoff w/cap
                    logger.info(
                        "Court %s still in progress. Delaying at least "
                        "%ss." % (pacer_court_id, next_delay))
                    pacer_court_ids[pacer_court_id]['until'] = (
                        now() + timedelta(seconds=next_delay))
                    pacer_court_ids[pacer_court_id]['count'] += 1
                    continue

            mark_court_in_progress(pacer_court_id, next_end_date)
            pacer_court_ids[pacer_court_id]['count'] = 1  # Reset
            delay['result'] = chain(
                get_and_save_free_document_report.si(pacer_court_id,
                                                     next_start_date,
                                                     next_end_date,
                                                     pacer_session),
                mark_court_done_on_date.s(pacer_court_id, next_end_date),
            ).apply_async()
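
The polling delay backs off linearly and is capped: min(count * 5, 30) yields waits of 5, 10, 15, 20, 25, and then 30 seconds for every later attempt. A standalone sketch of that schedule:

def poll_delay(attempt):
    # Linear backoff capped at 30s, mirroring min(delay['count'] * 5, 30).
    return min(attempt * 5, 30)


assert [poll_delay(n) for n in range(1, 8)] == [5, 10, 15, 20, 25, 30, 30]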
Example #15
def get_pacer_session():
    return PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
Example #16
 def test_logging_in_bad_credentials(self):
     # Make sure password is more than eight characters.
     session = PacerSession(username="******", password="******")
     with self.assertRaises(PacerLoginException):
         session.login()
Example #17
def fetch(ctx, overwrite=False):
    print('fetch')
    session = PacerSession(username=os.environ.get('PACER_USERNAME'),
                           password=os.environ.get('PACER_PASSWORD'))
    today = date.today().strftime('%m/%d/%Y')
    citations = [
        '18:922A.F',
        '18:922C.F',
        '18:922E.F',
        '18:922G.F',
        '18:924A.F',
        '18:924C.F',
    ]
    for citation in citations:
        outputfile = 'data/{0}.tsv'.format(citation)

        if overwrite or not os.path.exists(outputfile):
            body = {
                "office": (None, ""),
                "case_type": (None, ""),
                "case_flags": (None, ""),
                "citation": (None, citation),
                "pending_citations": (None, "1"),
                "terminated_citations": (None, "1"),
                "cvbcases": (None, "No"),
                "filed_from": (None, "1/1/2007"),
                "filed_to": (None, today),
                "terminal_digit": (None, ""),
                "pending_defendants": (None, "on"),
                "terminated_defendants": (None, "on"),
                "fugitive_defendants": (None, ""),
                "nonfugitive_defendants": (None, "1"),
                "reportable_cases": (None, "1"),
                "non_reportable_cases": (None, "1"),
                "sort1": (None, "case number"),
                "sort2": (None, ""),
                "sort3": (None, ""),
                "format": (None, "data")
            }
            # The leading query-string token is a random request id in the
            # usual CM/ECF URL pattern.
            intermediate_resp = session.post(
                'https://ecf.ilnd.uscourts.gov/cgi-bin/CrCaseFiled-Rpt.pl'
                '?{0}-L_1_0-1'.format(randint(200000, 40000000)),
                files=body)

            intermediate_doc = BeautifulSoup(intermediate_resp.content, 'lxml')
            form = intermediate_doc.find('form')
            action = form.attrs.get('action')
            action_path = action.split('/')[-1]
            url = 'https://ecf.ilnd.uscourts.gov/cgi-bin/' + action_path

            resp = session.post(url)

            print('-' * 50)
            print(citation)
            print('-' * 50)
            print(resp.content)

            with open(outputfile, 'wb') as f:  # resp.content is bytes
                f.write(resp.content)

        else:
            print('skipped {0}'.format(citation))
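
Passing files= with (None, value) tuples is the requests idiom for sending plain multipart/form-data fields without attaching a real file, which is what the CM/ECF report form expects. A minimal illustration against a neutral echo endpoint (httpbin.org is used purely as a stand-in):

import requests

# Each (None, value) tuple becomes a bare multipart form field; the None
# filename tells requests there is no file attachment.
resp = requests.post(
    "https://httpbin.org/post",
    files={"citation": (None, "18:922A.F"), "format": (None, "data")},
)
print(resp.json()["form"])  # {'citation': '18:922A.F', 'format': 'data'}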