示例#1
0
 def parse(responses):
     """Convert a list of responses into a list of FreeOpinionRow objects.

     Each response is checked for an HTTP error status, has its encoding set,
     and is cleaned and parsed into a tree whose links are rewritten against
     the response URL. Responses that report zero opinions are skipped. For
     the rest, every row after the first one in the first table is wrapped in
     a FreeOpinionRow, which is also handed the previously parsed row (or an
     empty dict when nothing has been parsed yet).
     """
     parsed_rows = []
     court_id = "Court not yet set."
     count_label_xpath = ('//b[contains(text(), "Total number of '
                          'opinions reported")]')
     for response in responses:
         response.raise_for_status()
         court_id = get_court_id_from_url(response.url)
         set_response_encoding(response)
         tree = get_html_parsed_text(clean_html(response.text))
         tree.rewrite_links(fix_links_in_lxml_tree, base_href=response.url)

         # The count is the text that trails the bold label element.
         count_label = tree.xpath(count_label_xpath)[0]
         if int(count_label.tail) == 0:
             continue

         for table_row in tree.xpath('(//table)[1]//tr[position() > 1]'):
             # Each new row receives the most recently parsed row, if any.
             previous = parsed_rows[-1] if parsed_rows else {}
             parsed_rows.append(FreeOpinionRow(table_row, previous, court_id))

     message = ("Parsed %s results from written opinions report at %s" %
                (len(parsed_rows), court_id))
     logger.info(message)
     return parsed_rows
示例#2
0
    def test_extract_written_documents_report(self):
        """Do all the written reports work?

        For every court that is not a U.S. Court of Appeals and that has an
        entry in ``self.valid_dates``, query that court's report one day at a
        time, starting at the court's valid date. When the loop finishes
        without a ``break``, assert that no value in any result is None
        (``nature_of_suit`` and ``cause`` excepted) and that the first result
        downloads with a PDF content type.

        Raises:
            ValueError: if the date being queried moves past today.
            ConnectionError: if the query is still failing on the last of
                ``max_retries`` attempts.
        """

        for court in self.courts:
            if court['type'] == "U.S. Courts of Appeals":
                continue
            court_id = get_court_id_from_url(court['court_link'])

            if court_id not in self.valid_dates:
                continue

            results = []
            report = self.reports[court_id]
            some_date = convert_date_string(self.valid_dates[court_id])
            retry_count = 1
            max_retries = 5  # We'll try five times total
            while not results and retry_count <= max_retries:
                # This loop is sometimes needed to find a date with documents.
                # In general the valid dates json object should suffice,
                # however.
                if some_date > date.today():
                    raise ValueError("Runaway date query for %s: %s" %
                                     (court_id, some_date))
                try:
                    report.query(some_date, some_date, sort='case_number')
                except ConnectionError as e:
                    # The loop condition already guarantees
                    # retry_count <= max_retries here, so the give-up check
                    # has to fire on the final attempt; otherwise the error
                    # would never be re-raised.
                    if retry_count >= max_retries:
                        print("%s: Repeated errors at this court." % e)
                        raise
                    print("%s. Trying again (%s of %s)" %
                          (e, retry_count, max_retries))
                    time.sleep(10)  # Give the server a moment of rest.
                    retry_count += 1
                    continue
                if not report.responses:
                    break  # Not a supported court.
                # NOTE(review): nothing in this loop assigns to `results`, so
                # it stays empty and the loop can only end via the ValueError
                # above, the break, or a re-raised ConnectionError. The parsed
                # report data presumably needs to be stored in `results`
                # here -- confirm against the report API.
                some_date += timedelta(days=1)

            else:
                # While loop ended normally (without hitting break)
                for result in results:
                    for k, v in result.items():
                        if k in ['nature_of_suit', 'cause']:
                            continue
                        self.assertIsNotNone(
                            v,
                            msg="Value of key %s is None in court %s" %
                                (k, court_id)
                        )

                # Can we download one item from each court?
                r = report.download_pdf(results[0]['pacer_case_id'],
                                        results[0]['pacer_doc_id'])
                if r is None:
                    # Extremely messed up download.
                    continue
                self.assertEqual(r.headers['Content-Type'], 'application/pdf')
    def test_extract_written_documents_report(self):
        """Do all the written reports work?

        For every court that is not a U.S. Court of Appeals and that has an
        entry in ``self.valid_dates``, query that court's report one day at a
        time, starting at the court's valid date. When the loop finishes
        without a ``break``, assert that no value in any result is None
        (``nature_of_suit`` and ``cause`` excepted) and that the first result
        downloads with a PDF content type.

        Raises:
            ValueError: if the date being queried moves past today.
            ConnectionError: if the query is still failing on the last of
                ``max_retries`` attempts.
        """

        for court in self.courts:
            if court["type"] == "U.S. Courts of Appeals":
                continue
            court_id = get_court_id_from_url(court["court_link"])

            if court_id not in self.valid_dates:
                continue

            results = []
            report = self.reports[court_id]
            some_date = convert_date_string(self.valid_dates[court_id])
            retry_count = 1
            max_retries = 5  # We'll try five times total
            while not results and retry_count <= max_retries:
                # This loop is sometimes needed to find a date with documents.
                # In general the valid dates json object should suffice,
                # however.
                if some_date > date.today():
                    raise ValueError("Runaway date query for %s: %s" %
                                     (court_id, some_date))
                try:
                    report.query(some_date, some_date, sort="case_number")
                except ConnectionError as e:
                    # The loop condition already guarantees
                    # retry_count <= max_retries here, so the give-up check
                    # has to fire on the final attempt; otherwise the error
                    # would never be re-raised.
                    if retry_count >= max_retries:
                        print("%s: Repeated errors at this court." % e)
                        raise
                    print("%s. Trying again (%s of %s)" %
                          (e, retry_count, max_retries))
                    time.sleep(10)  # Give the server a moment of rest.
                    retry_count += 1
                    continue
                if not report.responses:
                    break  # Not a supported court.
                # NOTE(review): nothing in this loop assigns to `results`, so
                # it stays empty and the loop can only end via the ValueError
                # above, the break, or a re-raised ConnectionError. The parsed
                # report data presumably needs to be stored in `results`
                # here -- confirm against the report API.
                some_date += timedelta(days=1)

            else:
                # While loop ended normally (without hitting break)
                for result in results:
                    for k, v in result.items():
                        if k in ["nature_of_suit", "cause"]:
                            continue
                        self.assertIsNotNone(
                            v,
                            msg="Value of key %s is None in court %s" %
                            (k, court_id),
                        )

                # Can we download one item from each court?
                r = report.download_pdf(results[0]["pacer_case_id"],
                                        results[0]["pacer_doc_id"])
                if r is None:
                    # Extremely messed up download.
                    continue
                self.assertEqual(r.headers["Content-Type"], "application/pdf")
示例#4
0
    def setUpClass(cls):
        """Load the court list and valid dates, then build one report per court.

        The session returned by ``login`` is used when both PACER credentials
        are set; otherwise a default ``PacerSession`` is used. Every
        ``FreeOpinionReport`` shares that one session.
        """
        pacer_session = PacerSession()

        if PACER_USERNAME and PACER_PASSWORD:
            # CAND chosen at random
            pacer_session = login('cand', PACER_USERNAME, PACER_PASSWORD)

        courts_path = os.path.join(JURISCRAPER_ROOT, 'pacer/courts.json')
        with open(courts_path) as courts_file:
            cls.courts = get_courts_from_json(json.load(courts_file))

        dates_path = os.path.join(TESTS_ROOT,
                                  'fixtures/valid_free_opinion_dates.json')
        with open(dates_path) as dates_file:
            cls.valid_dates = json.load(dates_file)

        # Reports are keyed by the court ID derived from each court's link.
        reports = cls.reports = {}
        for court in cls.courts:
            court_id = get_court_id_from_url(court['court_link'])
            reports[court_id] = FreeOpinionReport(court_id, pacer_session)
示例#5
0
    def setUpClass(cls):
        """Load the court list and valid dates, then build one report per court.

        A ``PacerSession`` carrying the PACER username and password is used
        when both are set; otherwise a default ``PacerSession`` is used. Every
        ``FreeOpinionReport`` shares that one session.
        """
        pacer_session = PacerSession()

        if PACER_USERNAME and PACER_PASSWORD:
            # CAND chosen at random
            pacer_session = PacerSession(
                username=PACER_USERNAME,
                password=PACER_PASSWORD,
            )

        courts_path = os.path.join(JURISCRAPER_ROOT, 'pacer/courts.json')
        with open(courts_path) as courts_file:
            cls.courts = get_courts_from_json(json.load(courts_file))

        dates_path = os.path.join(TESTS_ROOT,
                                  'fixtures/valid_free_opinion_dates.json')
        with open(dates_path) as dates_file:
            cls.valid_dates = json.load(dates_file)

        # Reports are keyed by the court ID derived from each court's link.
        reports = cls.reports = {}
        for court in cls.courts:
            court_id = get_court_id_from_url(court['court_link'])
            reports[court_id] = FreeOpinionReport(court_id, pacer_session)
示例#6
0
    def setUp(self):
        """Load the court list and valid dates, then build one report per court.

        When ``pacer_credentials_are_defined()`` is true, the session from
        ``get_pacer_session()`` is logged in and used; otherwise a default
        ``PacerSession`` is used. Every ``FreeOpinionReport`` shares that one
        session.
        """
        pacer_session = PacerSession()

        if pacer_credentials_are_defined():
            # CAND chosen at random
            pacer_session = get_pacer_session()
            pacer_session.login()

        courts_path = os.path.join(JURISCRAPER_ROOT, 'pacer/courts.json')
        with open(courts_path) as courts_file:
            self.courts = get_courts_from_json(json.load(courts_file))

        dates_path = os.path.join(
            TESTS_ROOT_EXAMPLES_PACER,
            'dates/valid_free_opinion_dates.json',
        )
        with open(dates_path) as dates_file:
            self.valid_dates = json.load(dates_file)

        # Reports are keyed by the court ID derived from each court's link.
        reports = self.reports = {}
        for court in self.courts:
            court_id = get_court_id_from_url(court['court_link'])
            reports[court_id] = FreeOpinionReport(court_id, pacer_session)
示例#7
0
    def setUp(self):
        """Load the court list and valid dates, then build one report per court.

        A ``PacerSession`` carrying the PACER username and password is created
        and logged in when both are set; otherwise a default ``PacerSession``
        is used. Every ``FreeOpinionReport`` shares that one session.
        """
        pacer_session = PacerSession()

        if PACER_USERNAME and PACER_PASSWORD:
            # CAND chosen at random
            pacer_session = PacerSession(
                username=PACER_USERNAME,
                password=PACER_PASSWORD,
            )
            pacer_session.login()

        courts_path = os.path.join(JURISCRAPER_ROOT, 'pacer/courts.json')
        with open(courts_path) as courts_file:
            self.courts = get_courts_from_json(json.load(courts_file))

        dates_path = os.path.join(TESTS_ROOT,
                                  'fixtures/valid_free_opinion_dates.json')
        with open(dates_path) as dates_file:
            self.valid_dates = json.load(dates_file)

        # Reports are keyed by the court ID derived from each court's link.
        reports = self.reports = {}
        for court in self.courts:
            court_id = get_court_id_from_url(court['court_link'])
            reports[court_id] = FreeOpinionReport(court_id, pacer_session)
示例#8
0
 def test_getting_court_id_from_url(self):
     """Check that each sample URL yields the expected court ID."""
     expected_by_url = (
         ('https://ecf.almd.uscourts.gov/cgi-bin/DktRpt.pl?56120', 'almd'),
     )
     for url, expected_id in expected_by_url:
         actual_id = get_court_id_from_url(url)
         self.assertEqual(actual_id, expected_id)
示例#9
0
 def test_getting_court_id_from_url(self):
     """Check that each sample URL yields the expected court ID."""
     sample_url = 'https://ecf.almd.uscourts.gov/cgi-bin/DktRpt.pl?56120'
     cases = ((sample_url, 'almd'),)
     for question, answer in cases:
         court_id = get_court_id_from_url(question)
         self.assertEqual(court_id, answer)