示例#1
0
)
from cl.recap.models import (
    FjcIntegratedDatabase,
    PacerFetchQueue,
    PacerHtmlFiles,
    ProcessingQueue,
    PROCESSING_STATUS,
    REQUEST_TYPE,
    UPLOAD_TYPE,
)
from cl.scrapers.tasks import extract_recap_pdf, get_page_count
from cl.search.models import Docket, DocketEntry, RECAPDocument
from cl.search.tasks import add_or_update_recap_docket, add_items_to_solr

# Module-level logger named after this module, per the logging convention.
logger = logging.getLogger(__name__)
# Shared CaseNameTweaker instance, built once at import time since
# constructing it takes resources (see the similar module-level comment
# elsewhere in this codebase).
cnt = CaseNameTweaker()


def process_recap_upload(pq):
    """Process an item uploaded from an extension or API user.

    Uploaded objects can take a variety of forms, and we'll need to
    process them accordingly: PDFs and attachment pages are handed off to
    a single async task, while dockets are parsed and then chained into
    add_or_update_recap_docket for follow-up processing.
    """
    upload_type = pq.upload_type
    if upload_type == UPLOAD_TYPE.PDF:
        process_recap_pdf.delay(pq.pk)
    elif upload_type == UPLOAD_TYPE.ATTACHMENT_PAGE:
        process_recap_attachment.delay(pq.pk)
    elif upload_type == UPLOAD_TYPE.DOCKET:
        # Two-step pipeline: parse the docket, then pass the result on.
        chain(process_recap_docket.s(pq.pk),
              add_or_update_recap_docket.s()).apply_async()
    def test_scrape_all_example_files(self):
        """Finds all the $module_example* files and tests them with the sample
        scraper.

        Every non-backscraper juriscraper module is imported dynamically and
        run in LOCAL mode against its checked-in example file(s). The parsed
        output is compared with a sibling ``*.compare.json`` fixture, which
        is generated on first run if it does not exist yet. Scrapers slower
        than ``max_speed`` raise SlownessException (unless debugging);
        scrapers slower than ``warn_speed`` only print a warning.
        """

        module_strings = build_module_list('juriscraper')
        # Backscrapers are excluded from the count and skipped below.
        num_scrapers = len([s for s in module_strings
                            if 'backscraper' not in s])
        msg = "Testing {count} scrapers against their example files:"
        print(msg.format(count=num_scrapers))
        # Column width used to left-justify module names in the output.
        max_len_mod_string = max(len(mod) for mod in module_strings
                                 if 'backscraper' not in mod) + 2
        num_example_files = 0
        num_warnings = 0
        cnt = CaseNameTweaker()
        json_compare_extension = '.compare.json'
        for module_string in module_strings:
            package, module = module_string.rsplit('.', 1)
            # The fromlist argument makes __import__ return the leaf module
            # rather than the top-level package.
            mod = __import__("%s.%s" % (package, module),
                             globals(),
                             locals(),
                             [module])
            if 'backscraper' not in module_string:
                sys.stdout.write(
                    '  %s ' % module_string.ljust(max_len_mod_string)
                )
                sys.stdout.flush()
                # module_parts:
                # [0]  - "juriscraper"
                # [1]  - "opinions" or "oral_args"
                # ...  - rest of the path
                # [-1] - module name
                module_parts = module_string.split('.')
                example_path = os.path.join(
                    "tests", "examples", module_parts[1],
                    "united_states", module_parts[-1],
                )
                # Example files, excluding the JSON comparison fixtures.
                paths = [path for path in glob.glob('%s_example*' % example_path)
                         if not path.endswith(json_compare_extension)]
                self.assertTrue(
                    paths,
                    "No example file found for: %s! \n\nThe test looked in: "
                    "%s" % (
                        module_string.rsplit('.', 1)[1],
                        os.path.join(os.getcwd(), example_path),
                    ))
                num_example_files += len(paths)
                t1 = time.time()
                num_tests = len(paths)
                for path in paths:
                    # This loop allows multiple example files per module
                    if path.endswith('~'):
                        # Text editor backup: Not interesting.
                        continue
                    site = mod.Site(cnt=cnt)
                    site.url = path
                    # Forces a local GET
                    site.method = 'LOCAL'
                    site.parse()
                    # Now validate that the parsed result is as we expect
                    json_path = '%s%s' % (path.rsplit('.', 1)[0], json_compare_extension)
                    # NOTE(review): the ``encoding`` kwarg to json.loads is a
                    # Python 2 artifact; it was ignored/deprecated on Python 3
                    # and removed in 3.9 -- confirm the targeted interpreter.
                    json_data = json.loads(site.to_json(), encoding='utf-8')
                    if os.path.isfile(json_path):
                        # Compare result with corresponding json file
                        example_file = path.rsplit('/', 1)[1]
                        compare_file = json_path.rsplit('/', 1)[1]
                        with open(json_path, 'r') as input_file:
                            fixture_json = json.load(input_file)
                            self.assertEqual(
                                len(fixture_json),
                                len(json_data),
                                msg="Fixture and scraped data have different "
                                    "lengths: expected %s and scraped %s (%s)" % (
                                    len(fixture_json),
                                    len(json_data),
                                    module_string
                                )
                            )
                            # Compare item-by-item for more targeted failure
                            # messages than a whole-list comparison.
                            for i, item in enumerate(fixture_json):
                                self.assertEqual(
                                    fixture_json[i],
                                    json_data[i],
                                )

                    else:
                        # Generate corresponding json file if it doesn't
                        # already exist. This should only happen once
                        # when adding a new example html file.
                        with open(json_path, 'w') as json_example:
                            json.dump(json_data, json_example, indent=2)
                t2 = time.time()

                # Speed thresholds (seconds): hard-fail above max_speed,
                # warn above warn_speed.
                max_speed = 15
                warn_speed = 1
                speed = t2 - t1
                msg = ''
                if speed > max_speed:
                    if sys.gettrace() is None and not IS_TRAVIS:
                        # Only do this if we're not debugging. Debuggers make
                        # things slower and breakpoints make things stop.
                        raise SlownessException(
                            "This scraper took {speed}s to test, which is more "
                            "than the allowed speed of {max_speed}s. "
                            "Please speed it up for tests to pass.".format(
                                speed=speed,
                                max_speed=max_speed,
                            ))
                elif speed > warn_speed:
                    msg = ' - WARNING: SLOW SCRAPER'
                    num_warnings += 1
                else:
                    msg = ''

                print('(%s test(s) in %0.1f seconds%s)' % (num_tests, speed, msg))

        print("\n{num_scrapers} scrapers tested successfully against "
              "{num_example_files} example files, with {num_warnings} "
              "speed warnings.".format(
                  num_scrapers=num_scrapers,
                  num_example_files=num_example_files,
                  num_warnings=num_warnings,))
        if num_warnings:
            print("\nAt least one speed warning was triggered during the "
                   "tests. If this is due to a slow scraper you wrote, we "
                   "suggest attempting to speed it up, as it will be slow "
                   "both in production and while running tests. This is "
                   "currently a warning, but may raise a failure in the "
                   "future as performance requirements are tightened.")
        else:
            # Someday, this line of code will be run. That day is not today.
            print("\nNo speed warnings detected. That's great, keep up the " \
                  "good work!")
    def test_make_short_name(self):
        """Test that make_case_name_short() produces expected abbreviations.

        Each pair below is (full case name, expected short name). An
        expected value of '' means the tweaker punts and produces no
        short name for that input.
        """
        test_pairs = [
            # In re and Matter of
            ('In re Lissner', 'In re Lissner'),
            ('Matter of Lissner', 'Matter of Lissner'),

            # Plaintiff is in bad word list
            ('State v. Lissner', 'Lissner'),
            ('People v. Lissner', 'Lissner'),
            ('California v. Lissner', 'Lissner'),
            ('Dallas v. Lissner', 'Lissner'),

            # Basic 3-word case
            ('Langley v. Google', 'Langley'),
            # Similar to above, but more than 3 words
            ('Langley v. Google foo', 'Langley'),

            # United States v. ...
            ('United States v. Lissner', 'Lissner'),

            # Corporate first name
            ('Google, Inc. v. Langley', 'Langley'),
            ('Special, LLC v. Langley', 'Langley'),
            ('Google Corp. v. Langley', 'Langley'),

            # Shorter appellant than plaintiff
            ('Michael Lissner v. Langley', 'Langley'),

            # Multi-v with and w/o a bad_word
            ('Alameda v. Victor v. Keyboard', ''),
            ('Bloggers v. Victor v. Keyboard', ''),

            # Long left, short right
            ('Many words here v. Langley', 'Langley'),

            # Other manually added items
            ('Ilarion v. State', 'Ilarion'),
            ('Imery v. Vangil Ingenieros', 'Imery'),

            # Many more tests from real data!
            ('Bean v. City of Monahans', 'Bean'),
            ('Blanke v. Time, Inc.', 'Blanke'),
            ('New York Life Ins. Co. v. Deshotel', 'Deshotel'),
            ('Deatherage v. Deatherage', 'Deatherage'),
            ('Gonzalez Vargas v. Holder', ''),
            ('Campbell v. Wainwright', 'Campbell'),
            ('Liggett & Myers Tobacco Co. v. Finzer', 'Finzer'),
            ('United States v. Brenes', 'Brenes'),
            ('A.H. Robins Co., Inc. v. Eli Lilly & Co', ''),
            ('McKellar v. Hazen', 'McKellar'),
            ('Gil v. State', 'Gil'),
            ('Fuentes v. Owen', 'Fuentes'),
            ('State v. Shearer', 'Shearer'),
            ('United States v. Smither', 'Smither'),
            ('People v. Bradbury', 'Bradbury'),
            ('Venable (James) v. State', ''),
            ('Burkhardt v. Bailey', 'Burkhardt'),
            ('DeLorenzo v. Bales', 'DeLorenzo'),
            ('Loucks v. Bauman', 'Loucks'),
            ('Kenneth Stern v. Robert Weinstein', ''),
            ('Rayner v. Secretary of Health and Human Services', 'Rayner'),
            ('Rhyne v. Martin', 'Rhyne'),
            ('State v. Wolverton', 'Wolverton'),
            ('State v. Flood', 'Flood'),
            ('Amason v. Natural Gas Pipeline Co.', 'Amason'),
            ('United States v. Bryant', 'Bryant'),
            ('WELLS FARGO BANK v. APACHE TRIBE OF OKLAHOMA', ''),
            ('Stewart v. Tupperware Corp.', 'Stewart'),
            ('Society of New York Hosp. v. ASSOCIATED HOSP. SERV. OF NY', ''),
            ('Stein v. State Tax Commission', 'Stein'),
            (
                'The Putnam Pit, Inc. Geoffrey Davidian v. City of Cookeville, Tennessee Jim Shipley',
                ''),
            ('People v. Armstrong', 'Armstrong'),
            ('Weeks v. Weeks', 'Weeks'),
            ('Smith v. Xerox Corp.', ''),
            ('In Interest of Ad', ''),
            ('People v. Forsyth', 'Forsyth'),
            ('State v. LeClair', 'LeClair'),
            ('Agristor Credit Corp. v. Unruh', 'Unruh'),
            ('United States v. Larry L. Stewart', ''),
            ('Starling v. United States', 'Starling'),
            ('United States v. Pablo Colin-Molina', ''),
            ('Kenneth N. Juhl v. The United States', ''),
            ('Matter of Wilson', 'Matter of Wilson'),
            ('In Re Damon H.', ''),
            ('Centennial Ins. Co. v. Zylberberg', 'Zylberberg'),
            ('United States v. Donald Lee Stotler', ''),
            ('Byndloss v. State', 'Byndloss'),
            ('People v. Piatkowski', 'Piatkowski'),
            ('United States v. Willie James Morgan', ''),
            ('Harbison (Debra) v. Thieret (James)', ''),
            ('Federal Land Bank of Columbia v. Lieben', 'Lieben'),
            ('John Willard Greywind v. John T. Podrebarac', ''),
            ('State v. Powell', 'Powell'),
            ('Carr v. Galloway', 'Carr'),
            ('Saylors v. State', 'Saylors'),
            ('Jones v. Franke', 'Jones'),
            ('In Re Robert L. Mills, Debtor. Robert L. Mills v. Sdrawde '
             'Titleholders, Inc., a California Corporation', ''),
            ('Pollenex Corporation v. Sunbeam-Home Comfort, a Division of '
             'Sunbeam Corp., Raymond Industrial, Limited and Raymond Marketing '
             'Corporation of North America', ''),
            ('Longs v. State', 'Longs'),
            ('Performance Network Solutions v. Cyberklix', 'Cyberklix'),
            ('DiSabatino v. Salicete', 'DiSabatino'),
            ('State v. Jennifer Nicole Jackson', ''),
            ('United States v. Moreno', 'Moreno'),
            ('LOGAN & KANAWHA COAL v. Banque Francaise', ''),
            ('State v. Harrison', 'Harrison'),
            ('Efford v. Milam', 'Efford'),
            ('People v. Thompson', 'Thompson'),
            ('CINCINNATI THERMAL SPRAY v. Pender County', ''),
            ('JAH Ex Rel. RMH v. Wadle & Associates', ''),
            ('United Pub. Employees v. CITY & CTY. OF SAN FRAN.', ''),
            ('Warren v. Massachusetts Indemnity', 'Warren'),
            ('Marion Edwards v. State Farm Insurance Company and "John Doe,"',
             ''),
            ('Snowdon v. Grillo', 'Snowdon'),
            ('Adam Lunsford v. Cravens Funeral Home', ''),
            ('State v. Dillon', 'Dillon'),
            ('In Re Graham', 'In Re Graham'),
            ('Durham v. Chrysler Corp.', ''),  # Fails b/c Durham is a city!
            ('Carolyn Warrick v. Motiva Enterprises, L.L.C', ''),
            ('United States v. Aloi', 'Aloi'),
            ('United States Fidelity & Guaranty v. Graham', 'Graham'),
            ('Wildberger v. Rosenbaum', 'Wildberger'),
            ('Truck Insurance Exchange v. Michling', 'Michling'),
            ('Black Voters v. John J. McDonough', ''),
            ('State of Tennessee v. William F. Cain', ''),
            ('Robert J. Imbrogno v. Defense Logistics Agency', ''),
            ('Leetta Beachum, Administratrix v. Timothy Joseph White', ''),
            ('United States v. Jorge Gonzalez-Villegas', ''),
            ('Pitts v. Florida Bd. of Bar Examiners', 'Pitts'),
            ('State v. Pastushin', 'Pastushin'),
            ('Clark v. Clark', ''),
            ('Barrios v. Holder', 'Barrios'),
            ('Gregory L. Lavin v. United States', ''),
            ('Carpenter v. Consumers Power', 'Carpenter'),
            ('Derbabian v. S & C SNOWPLOWING, INC.', 'Derbabian'),
            ('Bright v. LSI CORP.', 'Bright'),
            ('State v. Brown', 'Brown'),
            ('KENNEY v. Keebler Co.', 'KENNEY'),
            ('Hill v. Chalanor', 'Hill'),
            ('Washington v. New Jersey', ''),
            ('Sollek v. Laseter', 'Sollek'),
            ('United States v. John Handy Jones, International Fidelity '
             'Insurance Company', ''),
            ('N.L.R.B. v. I. W. Corp', ''),
            ('Karpisek v. Cather & Sons Construction, Inc.', 'Karpisek'),
            ('Com. v. Wade', 'Com.'),
            ('Glascock v. Sukumlyn', 'Glascock'),
            ('Burroughs v. Hills', 'Burroughs'),
            ('State v. Darren Matthew Lee', ''),
            ('Mastondrea v. Occidental Hotels Management', 'Mastondrea'),
            ('Kent v. C. I. R', 'Kent'),
            ('Johnson v. City of Detroit', ''),
            ('Nolan v. United States', 'Nolan'),
            ('Currence v. Denver Tramway Corporation', 'Currence'),
            ('Matter of Cano', 'Matter of Cano'),
            # Two words after "Matter of --> Punt."
            ('Matter of Alphabet Soup', ''),
            # Zero words after "Matter of" --> Punt.
            ("Matter of", "Matter of"),
            ('Simmons v. Stalder', 'Simmons'),
            ('United States v. Donnell Hagood', ''),
            ('Kale v. United States INS', 'Kale'),
            ('Cmk v. Department of Revenue Ex Rel. Kb', 'Cmk'),
            ('State Farm Mut. Auto. Ins. Co. v. Barnes', 'Barnes'),
            ('In Re Krp', 'In Re Krp'),
            ('CH v. Department of Children and Families', 'CH'),
            ('Com. v. Monosky', 'Com.'),
            ('JITNEY-JUNGLE, INCORPORATED v. City of Brookhaven', ''),
            ('Carolyn Humphrey v. Memorial Hospitals Association', ''),
            ('Wagner v. Sanders Associates, Inc.', 'Wagner'),
            ('United States v. Venie (Arthur G.)', ''),
            ('Mitchell v. State', ''),
            ('City of Biloxi, Miss. v. Giuffrida', 'Giuffrida'),
            ('Sexton v. St. Clair Federal Sav. Bank', 'Sexton'),
            ('United States v. Matthews', 'Matthews'),
            ('Freeman v. Freeman', 'Freeman'),
            ('Spencer v. Toussaint', 'Spencer'),
            ('In Re Canaday', 'In Re Canaday'),
            ('Wenger v. Commission on Judicial Performance', 'Wenger'),
            ('Jackson v. Janecka', 'Janecka'),
            ('People of Michigan v. Ryan Christopher Smith', ''),
            ('Kincade (Michael) v. State', ''),
            ('Tonubbee v. River Parishes Guide', 'Tonubbee'),
            ('United States v. Richiez', 'Richiez'),
            ('In Re Allamaras', 'In Re Allamaras'),
            ('United States v. Capoccia', 'Capoccia'),
            ('Com. v. DeFranco', 'Com.'),
            ('Matheny v. Porter', 'Matheny'),
            ('Piper v. Hoffman', 'Piper'),
            ('People v. Smith', ''),  # Punted b/c People and Smith are bad.
            ('Mobuary, Joseph v. State.', ''),  # Punted b/c "State." has punct
        ]
        # Run every pair through a single tweaker; the failure message
        # shows the input plus expected vs. actual values.
        tweaker = CaseNameTweaker()
        for t in test_pairs:
            output = tweaker.make_case_name_short(t[0])
            self.assertEqual(output, t[1],
                             "Input was:\n\t%s\n\n\tExpected: '%s'\n\tActual: '%s'" %
                             (t[0], t[1], output))
示例#4
0
 def __init__(self, stdout=None, stderr=None, no_color=False):
     """Initialize the command, forwarding output-stream options.

     Bug fix: the original passed hard-coded ``None``/``False`` values to
     the parent constructor instead of the arguments it received, so any
     custom ``stdout``/``stderr`` stream or ``no_color`` flag supplied by
     the caller was silently dropped.
     """
     super(Command, self).__init__(stdout=stdout, stderr=stderr,
                                   no_color=no_color)
     # One tweaker per command instance; construction takes resources.
     self.cnt = CaseNameTweaker()
示例#5
0
class PacerXMLParser(object):
    """A class to parse a PACER XML file.

    Note: this class uses Python 2 print-statement syntax throughout.
    """

    # Shared across all instances; built once because construction takes
    # resources.
    cnt = CaseNameTweaker()

    def __init__(self, path):
        """Parse the XML file at ``path`` and populate docket attributes.

        :param path: filesystem path to the PACER docket XML file.
        """
        print "Doing %s" % path
        # High-level attributes
        self.path = path
        self.xml = self.get_xml_contents()
        self.case_details = self.get_case_details()
        self.document_list = self.get_document_list()
        self.document_count = self.get_document_count()

        # Docket attributes
        self.court = self.get_court()
        self.docket_number = self.get_str_from_node(self.case_details,
                                                    'docket_num')
        self.pacer_case_id = self.get_str_from_node(self.case_details,
                                                    'pacer_case_num')
        self.date_filed = self.get_datetime_from_node(self.case_details,
                                                      'date_case_filed',
                                                      cast_to_date=True)
        self.date_terminated = self.get_datetime_from_node(
            self.case_details, 'date_case_terminated', cast_to_date=True)
        self.date_last_filing = self.get_datetime_from_node(self.case_details,
                                                            'date_last_filing',
                                                            cast_to_date=True)
        # Normalize the case name, then derive its short form.
        self.case_name = harmonize(
            self.get_str_from_node(self.case_details, 'case_name'))
        self.case_name_short = self.cnt.make_case_name_short(self.case_name)
        self.cause = self.get_str_from_node(self.case_details, 'case_cause')
        self.nature_of_suit = self.get_str_from_node(self.case_details,
                                                     'nature_of_suit')
        self.jury_demand = self.get_str_from_node(self.case_details,
                                                  'jury_demand')
        self.jurisdiction_type = self.get_str_from_node(
            self.case_details, 'jurisdiction')
        self.assigned_to, self.assigned_to_str = self.get_judges('assigned_to')
        self.referred_to, self.referred_to_str = self.get_judges('referred_to')
        self.blocked, self.date_blocked = self.set_blocked_fields()

        # Non-parsed fields
        self.filepath_local = os.path.join('recap', self.path)
        self.filepath_ia = get_docketxml_url_from_path(self.path)

    def save(self, debug):
        """Save the item to the database, updating any existing items.

        :param debug: when truthy, skip the actual database write.
        Returns None if an error occurs.
        """
        required_fields = ['case_name', 'date_filed']
        for field in required_fields:
            if not getattr(self, field):
                print "  Missing required field: %s" % field
                return None

        try:
            # Match on either the PACER case ID or the docket number,
            # scoped to the court.
            d = Docket.objects.get(
                Q(pacer_case_id=self.pacer_case_id)
                | Q(docket_number=self.docket_number),
                court=self.court,
            )
            if d.source == Docket.SCRAPER:
                d.source = Docket.RECAP_AND_SCRAPER
        except Docket.DoesNotExist:
            d = Docket(source=Docket.RECAP, )
        except Docket.MultipleObjectsReturned:
            print "  Got multiple results while attempting save."
            return None

        # NOTE(review): this copies EVERY parser attribute (including
        # non-model ones like self.xml and self.path) onto the Docket
        # instance; presumably only model fields are persisted on save()
        # -- confirm this is intentional.
        for attr, v in self.__dict__.items():
            setattr(d, attr, v)

        if not debug:
            d.save()
            print "  Saved as Docket %s: https://www.courtlistener.com%s" % (
                d.pk, d.get_absolute_url())
        return d

    def get_xml_contents(self):
        """Extract the XML from the file on disk and return it as an lxml
        tree
        """
        # recover=True lets lxml tolerate malformed XML instead of raising.
        xml_parser = etree.XMLParser(recover=True)
        tree = etree.parse(self.path, xml_parser)

        return tree

    def get_case_details(self):
        """Most of the details are in the case_details node, so set it aside
        for faster parsing.
        """
        return self.xml.xpath('//case_details')[0]

    def get_document_list(self):
        """Get the XML nodes for the documents"""
        return self.xml.xpath('//document_list/document')

    def get_document_count(self):
        """Get the number of documents associated with this docket."""
        return len(self.document_list)

    def make_documents(self, docket, debug):
        """Parse through the document nodes, making good objects.

        For every node, create a line item on the Docket (a DocketEntry), and
        create 1..n additional RECAPDocuments (attachments or regular documents)
        that are associated with that DocketEntry.

        :param docket: the Docket these entries belong to.
        :param debug: when truthy, skip database writes.
        Returns None if an error occurs.
        """
        for doc_node in self.document_list:
            # Make a DocketEntry object
            entry_number = int(doc_node.xpath('@doc_num')[0])
            attachment_number = int(doc_node.xpath('@attachment_num')[0])
            print "Working on document %s, attachment %s" % (entry_number,
                                                             attachment_number)

            # Attachment number 0 denotes the main document itself.
            if attachment_number == 0:
                document_type = RECAPDocument.PACER_DOCUMENT
            else:
                document_type = RECAPDocument.ATTACHMENT

            try:
                docket_entry = DocketEntry.objects.get(
                    docket=docket,
                    entry_number=entry_number,
                )
            except DocketEntry.DoesNotExist:
                if document_type == RECAPDocument.PACER_DOCUMENT:
                    docket_entry = DocketEntry(
                        docket=docket,
                        entry_number=entry_number,
                    )
                else:
                    # An attachment with no parent entry can't be linked;
                    # skip it rather than creating an orphan.
                    logger.error(
                        "Tried to create attachment without a DocketEntry "
                        "object to associate it with.")
                    continue

            if document_type == RECAPDocument.PACER_DOCUMENT:
                # Prefer freshly-parsed values; fall back to whatever the
                # existing entry already has.
                date_filed = (self.get_datetime_from_node(
                    doc_node, 'date_filed', cast_to_date=True)
                              or docket_entry.date_filed)
                docket_entry.date_filed = date_filed
                docket_entry.description = (self.get_str_from_node(
                    doc_node, 'long_desc') or docket_entry.description)
                if not debug:
                    docket_entry.save()

            recap_doc = self.make_recap_document(
                doc_node,
                docket_entry,
                entry_number,
                attachment_number,
                document_type,
                debug,
            )

    def make_recap_document(self, doc_node, docket_entry, entry_number,
                            attachment_number, document_type, debug):
        """Make a PACER document.

        Gets or creates the RECAPDocument for ``pacer_doc_id``, refreshes
        its fields from the XML node, and saves it unless ``debug``.
        """
        pacer_document_id = self.get_str_from_node(doc_node, 'pacer_doc_id')
        try:
            recap_doc = RECAPDocument.objects.get(
                pacer_doc_id=pacer_document_id)
        except RECAPDocument.DoesNotExist:
            recap_doc = RECAPDocument(pacer_doc_id=pacer_document_id,
                                      docket_entry=docket_entry)

        recap_doc.date_upload = self.get_datetime_from_node(
            doc_node, 'upload_date')
        recap_doc.document_type = document_type or recap_doc.document_type
        recap_doc.document_number = entry_number or recap_doc.document_number
        # If we can't parse the availability node (it returns None), default it
        # to False.
        availability = self.get_bool_from_node(doc_node, 'available')
        recap_doc.is_available = False if availability is None else availability
        recap_doc.sha1 = self.get_str_from_node(doc_node, 'sha1')
        recap_doc.description = (self.get_str_from_node(
            doc_node, 'short_desc') or recap_doc.description)
        if recap_doc.is_available:
            # Only available documents get Internet Archive / local paths.
            recap_doc.filepath_ia = get_ia_document_url_from_path(
                self.path, entry_number, attachment_number)
            recap_doc.filepath_local = os.path.join(
                'recap',
                get_local_document_url_from_path(self.path, entry_number,
                                                 attachment_number),
            )
        if document_type == RECAPDocument.ATTACHMENT:
            recap_doc.attachment_number = attachment_number
        if not debug:
            recap_doc.save()
        return recap_doc

    def get_court(self):
        """Extract the court from the XML and return it as a Court object"""
        court_str = self.case_details.xpath('court/text()')[0].strip()
        try:
            # Map the PACER court ID to the CourtListener ID, falling back
            # to the raw string if no mapping exists.
            c = Court.objects.get(pk=pacer_to_cl_ids.get(court_str, court_str))
        except Court.DoesNotExist:
            raise ParsingException("Unable to identify court: %s" % court_str)
        else:
            return c

    @staticmethod
    def get_bool_from_node(node, path):
        """Parse an int under ``path`` and return it as a bool, or None."""
        try:
            s = node.xpath('%s/text()' % path)[0].strip()
            n = int(s)
        except IndexError:
            # No such node.
            print "  Couldn't get bool from path: %s" % path
            return None
        except ValueError:
            # Node text wasn't an integer.
            print(
                "  Couldn't convert text '%s' to int when making boolean "
                "for path: %s" % (s, path))
            return None
        else:
            return bool(n)

    @staticmethod
    def get_str_from_node(node, path):
        """Return the stripped text under ``path``, or '' when missing."""
        try:
            s = node.xpath('%s/text()' % path)[0].strip()
        except IndexError:
            print "  Couldn't get string from path: %s" % path
            return ''  # Return an empty string. Don't return None.
        else:
            return s

    def get_int_from_details(self, node):
        """Return the int under ``node`` in case_details; raise on failure."""
        s = self.case_details.xpath('%s/text()' % node)[0].strip()
        try:
            return int(s)
        except ValueError:
            # Can't parse string to int
            print "  Couldn't get int for node %s" % node
            raise ParsingException("Cannot extract int for node %s" % node)

    @staticmethod
    def get_datetime_from_node(node, path, cast_to_date=False):
        """Parse a datetime from the XML located at node.

        :param cast_to_date: when True, return a date instead of datetime.
        Returns None when the node is missing.
        """
        try:
            s = node.xpath('%s/text()' % path)[0].strip()
        except IndexError:
            print "  Couldn't get date from path: %s" % path
            return None
        else:
            d = parser.parse(s)
            d = d.replace(tzinfo=d.tzinfo or gettz('UTC'))  # Set it to UTC.
            if cast_to_date is True:
                return d.date()
            return d

    def get_judges(self, node):
        """Parse out the judge string and then look it up in the DB.

        Returns a (Person-or-None, raw judge string) tuple; the Person is
        only returned when exactly one match is found.
        """
        try:
            s = self.case_details.xpath('%s/text()' % node)[0].strip()
        except IndexError:
            print "  Couldn't get judge for node: %s" % node
            return None, ''
        else:
            judge_names = find_judge_names(s)
            judges = []
            for judge_name in judge_names:
                judges.append(
                    find_person(judge_name,
                                self.court.pk,
                                case_date=self.date_filed))
            # Drop failed lookups before deciding how many matches we have.
            judges = [c for c in judges if c is not None]
            if len(judges) == 0:
                print "  No judges found after lookup."
                logger.info("No judge for: %s" %
                            ((s, self.court.pk, self.date_filed), ))
                return None, s
            elif len(judges) == 1:
                return judges[0], s
            elif len(judges) > 1:
                # Ambiguous: return the raw string but no Person.
                print "  Too many judges found: %s" % len(judges)
                return None, s

    def set_blocked_fields(self):
        """Set the blocked status for the Docket.

        Dockets are public (blocked is False) when:

                                   Is Bankr. Court
                                +---------+--------+
                                |   YES   |   NO   |
                +---------------+---------+--------+
         Size   | > 500 items   |    X    |    X   |
          of    +---------------+---------+--------+
        Docket  | <= 500 items  |         |    X   |
                +---------------+---------+--------+

        Returns a (blocked, date_blocked) tuple.
        """
        # Small bankruptcy dockets are blocked to protect privacy.
        if self.document_count <= 500 and self.court.is_bankruptcy:
            return True, date.today()
        return False, None
示例#6
0
from juriscraper.lib.string_utils import (
    CaseNameTweaker,
    clean_string,
    harmonize,
    titlecase,
)
from lxml import etree

from cl.corpus_importer.court_regexes import state_pairs
from cl.lib.crypto import sha1_of_file
from cl.people_db.lookup_utils import extract_judge_last_name

from .regexes_columbia import FOLDER_DICT, SPECIAL_REGEXES

# Built once at module level since constructing a CaseNameTweaker takes
# resources.
CASE_NAME_TWEAKER = CaseNameTweaker()

# XML tags for which content will be condensed into plain text during parsing.
SIMPLE_TAGS = [
    "reporter_caption",
    "citation",
    "caption",
    "court",
    "docket",
    "posture",
    "date",
    "hearing_date",
    "panel",
    "attorneys",
]
    def test_scrape_all_example_files(self):
        """Finds all the $module_example* files and tests them with the sample
        scraper.

        Each non-backscraper module's example files are parsed in local test
        mode and the result is compared against the matching
        ``.compare.json`` fixture. Missing fixtures are generated and
        reported as a failure so they can be reviewed and committed.
        """
        module_strings = build_module_list("juriscraper")
        num_scrapers = len(
            [s for s in module_strings if "backscraper" not in s])
        # Pad module names so the progress output lines up in columns.
        max_len_mod_string = (max(
            len(mod)
            for mod in module_strings if "backscraper" not in mod) + 2)
        num_example_files = 0
        num_warnings = 0
        cnt = CaseNameTweaker()
        json_compare_extension = ".compare.json"
        json_compare_files_generated = []
        for module_string in module_strings:
            package, module = module_string.rsplit(".", 1)
            mod = __import__("%s.%s" % (package, module), globals(), locals(),
                             [module])
            if "backscraper" in module_string:
                continue
            sys.stdout.write("  %s " %
                             module_string.ljust(max_len_mod_string))
            sys.stdout.flush()
            # module_parts:
            # [0]  - "juriscraper"
            # [1]  - "opinions" or "oral_args"
            # ...  - rest of the path
            # [-1] - module name
            module_parts = module_string.split(".")
            example_path = os.path.join(
                "tests",
                "examples",
                module_parts[1],
                "united_states",
                module_parts[-1],
            )
            paths = [
                path for path in glob.glob("%s_example*" % example_path)
                if not path.endswith(json_compare_extension)
            ]
            self.assertTrue(
                paths,
                "No example file found for: %s! \n\nThe test looked in: "
                "%s" % (
                    module_string.rsplit(".", 1)[1],
                    os.path.join(os.getcwd(), example_path),
                ),
            )
            num_example_files += len(paths)
            t1 = time.time()
            num_tests = len(paths)
            for path in paths:
                # This loop allows multiple example files per module
                if path.endswith("~"):
                    # Text editor backup: Not interesting.
                    continue
                site = mod.Site(cnt=cnt)
                site.url = path
                # Forces a local GET
                site.enable_test_mode()
                site.parse()
                # Now validate that the parsed result is as we expect
                json_path = "%s%s" % (
                    path.rsplit(".", 1)[0],
                    json_compare_extension,
                )
                # NB: json.loads() no longer accepts an ``encoding`` kwarg
                # on Python 3.9+; to_json() already returns text.
                json_data = json.loads(site.to_json())
                if os.path.isfile(json_path):
                    # Compare result with corresponding json file
                    with open(json_path, "r") as input_file:
                        fixture_json = json.load(input_file)
                    self.assertEqual(
                        len(fixture_json),
                        len(json_data),
                        msg="Fixture and scraped data have different "
                        "lengths: expected %s and scraped %s (%s)" % (
                            len(fixture_json),
                            len(json_data),
                            module_string,
                        ),
                    )
                    # Item-by-item comparison gives a readable diff on
                    # the first mismatch.
                    for fixture_item, scraped_item in zip(fixture_json,
                                                          json_data):
                        self.assertEqual(fixture_item, scraped_item)
                else:
                    # Generate corresponding json file if it doesn't
                    # already exist. This should only happen once
                    # when adding a new example html file.
                    warn_generated_compare_file(json_path)
                    json_compare_files_generated.append(json_path)
                    with open(json_path, "w") as json_example:
                        json.dump(json_data, json_example, indent=2)
            t2 = time.time()
            duration = t2 - t1
            # Reuse the measured duration rather than re-reading the clock.
            warning_msg = warn_or_crash_slow_parser(duration)
            if warning_msg:
                num_warnings += 1

            print("(%s test(s) in %0.1f seconds)" % (num_tests, duration))

        print("\n{num_scrapers} scrapers tested successfully against "
              "{num_example_files} example files, with {num_warnings} "
              "speed warnings.".format(
                  num_scrapers=num_scrapers,
                  num_example_files=num_example_files,
                  num_warnings=num_warnings,
              ))
        if json_compare_files_generated:
            msg = (
                "Generated compare file(s) during test, please review before proceeding. "
                "If the data looks good, run tests again, then be sure to include "
                "the new compare file(s) in your commit: %s")
            self.fail(msg % ", ".join(json_compare_files_generated))
        if num_warnings:
            print("\nAt least one speed warning was triggered during the "
                  "tests. If this is due to a slow scraper you wrote, we "
                  "suggest attempting to speed it up, as it will be slow "
                  "both in production and while running tests. This is "
                  "currently a warning, but may raise a failure in the "
                  "future as performance requirements are tightened.")
        else:
            # Someday, this line of code will be run. That day is not today.
            print("\nNo speed warnings detected. That's great, keep up the "
                  "good work!")
 def test_make_short_name(self):
     """Check CaseNameTweaker.make_case_name_short() against a large table
     of real-world case names.

     Each pair is (input case name, expected short name). An expected
     value of "" means the tweaker should punt and produce no short name
     for that input.
     """
     test_pairs = [
         # In re and Matter of
         ("In re Lissner", "In re Lissner"),
         ("Matter of Lissner", "Matter of Lissner"),
         # Plaintiff is in bad word list
         ("State v. Lissner", "Lissner"),
         ("People v. Lissner", "Lissner"),
         ("California v. Lissner", "Lissner"),
         ("Dallas v. Lissner", "Lissner"),
         # Basic 3-word case
         ("Langley v. Google", "Langley"),
         # Similar to above, but more than 3 words
         ("Langley v. Google foo", "Langley"),
         # United States v. ...
         ("United States v. Lissner", "Lissner"),
         # Corporate first name
         ("Google, Inc. v. Langley", "Langley"),
         ("Special, LLC v. Langley", "Langley"),
         ("Google Corp. v. Langley", "Langley"),
         # Shorter appellant than plaintiff
         ("Michael Lissner v. Langley", "Langley"),
         # Multi-v with and w/o a bad_word
         ("Alameda v. Victor v. Keyboard", ""),
         ("Bloggers v. Victor v. Keyboard", ""),
         # Long left, short right
         ("Many words here v. Langley", "Langley"),
         # Other manually added items
         ("Ilarion v. State", "Ilarion"),
         ("Imery v. Vangil Ingenieros", "Imery"),
         # Many more tests from real data!
         ("Bean v. City of Monahans", "Bean"),
         ("Blanke v. Time, Inc.", "Blanke"),
         ("New York Life Ins. Co. v. Deshotel", "Deshotel"),
         ("Deatherage v. Deatherage", "Deatherage"),
         ("Gonzalez Vargas v. Holder", ""),
         ("Campbell v. Wainwright", "Campbell"),
         ("Liggett & Myers Tobacco Co. v. Finzer", "Finzer"),
         ("United States v. Brenes", "Brenes"),
         ("A.H. Robins Co., Inc. v. Eli Lilly & Co", ""),
         ("McKellar v. Hazen", "McKellar"),
         ("Gil v. State", "Gil"),
         ("Fuentes v. Owen", "Fuentes"),
         ("State v. Shearer", "Shearer"),
         ("United States v. Smither", "Smither"),
         ("People v. Bradbury", "Bradbury"),
         ("Venable (James) v. State", ""),
         ("Burkhardt v. Bailey", "Burkhardt"),
         ("DeLorenzo v. Bales", "DeLorenzo"),
         ("Loucks v. Bauman", "Loucks"),
         ("Kenneth Stern v. Robert Weinstein", ""),
         ("Rayner v. Secretary of Health and Human Services", "Rayner"),
         ("Rhyne v. Martin", "Rhyne"),
         ("State v. Wolverton", "Wolverton"),
         ("State v. Flood", "Flood"),
         ("Amason v. Natural Gas Pipeline Co.", "Amason"),
         ("United States v. Bryant", "Bryant"),
         ("WELLS FARGO BANK v. APACHE TRIBE OF OKLAHOMA", ""),
         ("Stewart v. Tupperware Corp.", "Stewart"),
         ("Society of New York Hosp. v. ASSOCIATED HOSP. SERV. OF NY", ""),
         ("Stein v. State Tax Commission", "Stein"),
         (
             "The Putnam Pit, Inc. Geoffrey Davidian v. City of Cookeville, Tennessee Jim Shipley",
             "",
         ),
         ("People v. Armstrong", "Armstrong"),
         ("Weeks v. Weeks", "Weeks"),
         ("Smith v. Xerox Corp.", ""),
         ("In Interest of Ad", ""),
         ("People v. Forsyth", "Forsyth"),
         ("State v. LeClair", "LeClair"),
         ("Agristor Credit Corp. v. Unruh", "Unruh"),
         ("United States v. Larry L. Stewart", ""),
         ("Starling v. United States", "Starling"),
         ("United States v. Pablo Colin-Molina", ""),
         ("Kenneth N. Juhl v. The United States", ""),
         ("Matter of Wilson", "Matter of Wilson"),
         ("In Re Damon H.", ""),
         ("Centennial Ins. Co. v. Zylberberg", "Zylberberg"),
         ("United States v. Donald Lee Stotler", ""),
         ("Byndloss v. State", "Byndloss"),
         ("People v. Piatkowski", "Piatkowski"),
         ("United States v. Willie James Morgan", ""),
         ("Harbison (Debra) v. Thieret (James)", ""),
         ("Federal Land Bank of Columbia v. Lieben", "Lieben"),
         ("John Willard Greywind v. John T. Podrebarac", ""),
         ("State v. Powell", "Powell"),
         ("Carr v. Galloway", "Carr"),
         ("Saylors v. State", "Saylors"),
         ("Jones v. Franke", "Jones"),
         (
             "In Re Robert L. Mills, Debtor. Robert L. Mills v. Sdrawde "
             "Titleholders, Inc., a California Corporation",
             "",
         ),
         (
             "Pollenex Corporation v. Sunbeam-Home Comfort, a Division of "
             "Sunbeam Corp., Raymond Industrial, Limited and Raymond Marketing "
             "Corporation of North America",
             "",
         ),
         ("Longs v. State", "Longs"),
         ("Performance Network Solutions v. Cyberklix", "Cyberklix"),
         ("DiSabatino v. Salicete", "DiSabatino"),
         ("State v. Jennifer Nicole Jackson", ""),
         ("United States v. Moreno", "Moreno"),
         ("LOGAN & KANAWHA COAL v. Banque Francaise", ""),
         ("State v. Harrison", "Harrison"),
         ("Efford v. Milam", "Efford"),
         ("People v. Thompson", "Thompson"),
         ("CINCINNATI THERMAL SPRAY v. Pender County", ""),
         ("JAH Ex Rel. RMH v. Wadle & Associates", ""),
         ("United Pub. Employees v. CITY & CTY. OF SAN FRAN.", ""),
         ("Warren v. Massachusetts Indemnity", "Warren"),
         (
             'Marion Edwards v. State Farm Insurance Company and "John Doe,"',
             "",
         ),
         ("Snowdon v. Grillo", "Snowdon"),
         ("Adam Lunsford v. Cravens Funeral Home", ""),
         ("State v. Dillon", "Dillon"),
         ("In Re Graham", "In Re Graham"),
         ("Durham v. Chrysler Corp.", ""),  # Fails b/c Durham is a city!
         ("Carolyn Warrick v. Motiva Enterprises, L.L.C", ""),
         ("United States v. Aloi", "Aloi"),
         ("United States Fidelity & Guaranty v. Graham", "Graham"),
         ("Wildberger v. Rosenbaum", "Wildberger"),
         ("Truck Insurance Exchange v. Michling", "Michling"),
         ("Black Voters v. John J. McDonough", ""),
         ("State of Tennessee v. William F. Cain", ""),
         ("Robert J. Imbrogno v. Defense Logistics Agency", ""),
         ("Leetta Beachum, Administratrix v. Timothy Joseph White", ""),
         ("United States v. Jorge Gonzalez-Villegas", ""),
         ("Pitts v. Florida Bd. of Bar Examiners", "Pitts"),
         ("State v. Pastushin", "Pastushin"),
         ("Clark v. Clark", ""),
         ("Barrios v. Holder", "Barrios"),
         ("Gregory L. Lavin v. United States", ""),
         ("Carpenter v. Consumers Power", "Carpenter"),
         ("Derbabian v. S & C SNOWPLOWING, INC.", "Derbabian"),
         ("Bright v. LSI CORP.", "Bright"),
         ("State v. Brown", "Brown"),
         ("KENNEY v. Keebler Co.", "KENNEY"),
         ("Hill v. Chalanor", "Hill"),
         ("Washington v. New Jersey", ""),
         ("Sollek v. Laseter", "Sollek"),
         (
             "United States v. John Handy Jones, International Fidelity "
             "Insurance Company",
             "",
         ),
         ("N.L.R.B. v. I. W. Corp", ""),
         ("Karpisek v. Cather & Sons Construction, Inc.", "Karpisek"),
         ("Com. v. Wade", "Com."),
         ("Glascock v. Sukumlyn", "Glascock"),
         ("Burroughs v. Hills", "Burroughs"),
         ("State v. Darren Matthew Lee", ""),
         ("Mastondrea v. Occidental Hotels Management", "Mastondrea"),
         ("Kent v. C. I. R", "Kent"),
         ("Johnson v. City of Detroit", ""),
         ("Nolan v. United States", "Nolan"),
         ("Currence v. Denver Tramway Corporation", "Currence"),
         ("Matter of Cano", "Matter of Cano"),
         # Two words after "Matter of --> Punt."
         ("Matter of Alphabet Soup", ""),
         # Zero words after "Matter of" --> Punt.
         ("Matter of", "Matter of"),
         ("Simmons v. Stalder", "Simmons"),
         ("United States v. Donnell Hagood", ""),
         ("Kale v. United States INS", "Kale"),
         ("Cmk v. Department of Revenue Ex Rel. Kb", "Cmk"),
         ("State Farm Mut. Auto. Ins. Co. v. Barnes", "Barnes"),
         ("In Re Krp", "In Re Krp"),
         ("CH v. Department of Children and Families", "CH"),
         ("Com. v. Monosky", "Com."),
         ("JITNEY-JUNGLE, INCORPORATED v. City of Brookhaven", ""),
         ("Carolyn Humphrey v. Memorial Hospitals Association", ""),
         ("Wagner v. Sanders Associates, Inc.", "Wagner"),
         ("United States v. Venie (Arthur G.)", ""),
         ("Mitchell v. State", ""),
         ("City of Biloxi, Miss. v. Giuffrida", "Giuffrida"),
         ("Sexton v. St. Clair Federal Sav. Bank", "Sexton"),
         ("United States v. Matthews", "Matthews"),
         ("Freeman v. Freeman", "Freeman"),
         ("Spencer v. Toussaint", "Spencer"),
         ("In Re Canaday", "In Re Canaday"),
         ("Wenger v. Commission on Judicial Performance", "Wenger"),
         ("Jackson v. Janecka", "Janecka"),
         ("People of Michigan v. Ryan Christopher Smith", ""),
         ("Kincade (Michael) v. State", ""),
         ("Tonubbee v. River Parishes Guide", "Tonubbee"),
         ("United States v. Richiez", "Richiez"),
         ("In Re Allamaras", "In Re Allamaras"),
         ("United States v. Capoccia", "Capoccia"),
         ("Com. v. DeFranco", "Com."),
         ("Matheny v. Porter", "Matheny"),
         ("Piper v. Hoffman", "Piper"),
         ("People v. Smith", ""),  # Punted b/c People and Smith are bad.
         ("Mobuary, Joseph v. State.", ""),  # Punted b/c "State." has punct
     ]
     tweaker = CaseNameTweaker()
     for t in test_pairs:
         output = tweaker.make_case_name_short(t[0])
         self.assertEqual(
             output,
             t[1],
             "Input was:\n\t%s\n\n\tExpected: '%s'\n\tActual: '%s'" %
             (t[0], t[1], output),
         )
示例#9
0
    def test_scrape_all_example_files(self):
        """Finds all the $module_example* files and tests them with the sample
        scraper.

        Each non-backscraper module is run against its example files in
        local test mode. Slow scrapers trigger a warning, and scrapers
        slower than a hard limit raise SlownessException.
        """

        module_strings = build_module_list('juriscraper')
        num_scrapers = len([s for s in module_strings
                            if 'backscraper' not in s])
        msg = "Testing {count} scrapers against their example files:"
        print(msg.format(count=num_scrapers))
        # Pad module names so the progress output lines up in columns.
        max_len_mod_string = max(len(mod) for mod in module_strings
                                 if 'backscraper' not in mod) + 2
        num_example_files = 0
        num_warnings = 0
        cnt = CaseNameTweaker()
        for module_string in module_strings:
            package, module = module_string.rsplit('.', 1)
            mod = __import__("%s.%s" % (package, module),
                             globals(),
                             locals(),
                             [module])
            if 'backscraper' not in module_string:
                sys.stdout.write(
                    '  %s ' % module_string.ljust(max_len_mod_string)
                )
                sys.stdout.flush()
                # module_parts:
                # [0]  - "juriscraper"
                # [1]  - "opinions" or "oral_args"
                # ...  - rest of the path
                # [-1] - module name
                module_parts = module_string.split('.')
                example_path = os.path.join(
                    "tests", "examples", module_parts[1],
                    "united_states", module_parts[-1],
                )
                paths = glob.glob('%s_example*' % example_path)
                self.assertTrue(
                    paths,
                    "No example file found for: %s! \n\nThe test looked in: "
                    "%s" % (
                        module_string.rsplit('.', 1)[1],
                        os.path.join(os.getcwd(), example_path),
                    ))
                num_example_files += len(paths)
                t1 = time.time()
                num_tests = len(paths)
                for path in paths:
                    # This loop allows multiple example files per module
                    if path.endswith('~'):
                        # Text editor backup: Not interesting.
                        continue
                    site = mod.Site(cnt=cnt)
                    site.url = path
                    # Forces a local GET
                    site.method = 'LOCAL'
                    site.parse()
                t2 = time.time()

                # Crash above max_speed seconds; warn above warn_speed.
                max_speed = 15
                warn_speed = 1
                speed = t2 - t1
                msg = ''
                if speed > max_speed:
                    if sys.gettrace() is None:
                        # Only do this if we're not debugging. Debuggers make
                        # things slower and breakpoints make things stop.
                        raise SlownessException(
                            "This scraper took {speed}s to test, which is more "
                            "than the allowed speed of {max_speed}s. "
                            "Please speed it up for tests to pass.".format(
                                speed=speed,
                                max_speed=max_speed,
                            ))
                elif speed > warn_speed:
                    msg = ' - WARNING: SLOW SCRAPER'
                    num_warnings += 1
                else:
                    msg = ''

                print('(%s test(s) in %0.1f seconds%s)' % (num_tests, speed, msg))

        print("\n{num_scrapers} scrapers tested successfully against "
              "{num_example_files} example files, with {num_warnings} "
              "speed warnings.".format(
                  num_scrapers=num_scrapers,
                  num_example_files=num_example_files,
                  num_warnings=num_warnings,))
        if num_warnings:
            print("\nAt least one speed warning was triggered during the "
                   "tests. If this is due to a slow scraper you wrote, we "
                   "suggest attempting to speed it up, as it will be slow "
                   "both in production and while running tests. This is "
                   "currently a warning, but may raise a failure in the "
                   "future as performance requirements are tightened.")
        else:
            # Someday, this line of code will be run. That day is not today.
            print("\nNo speed warnings detected. That's great, keep up the " \
                  "good work!")
示例#10
0
class PacerXMLParser(object):
    """A class to parse a PACER XML file"""

    cnt = CaseNameTweaker()

    def __init__(self, path):
        """Read and parse the XML file at ``path``, then precompute the
        docket-level attributes the rest of the parser relies on.

        :param path: Path to a RECAP docket XML file on disk.
        """
        logger.info("Initializing parser for %s" % path)
        # High-level attributes
        self.path = path
        self.xml = self.get_xml_contents()
        self.case_details = self.get_case_details()
        self.document_list = self.get_document_list()
        self.party_list = self.get_party_list()
        self.document_count = self.get_document_count()

        # Docket attributes
        self.court = self.get_court()
        self.docket_number = self.get_str_from_node(self.case_details,
                                                    'docket_num')
        self.pacer_case_id = self.get_str_from_node(self.case_details,
                                                    'pacer_case_num')
        self.date_filed = self.get_datetime_from_node(self.case_details,
                                                      'date_case_filed',
                                                      cast_to_date=True)
        self.date_terminated = self.get_datetime_from_node(
            self.case_details, 'date_case_terminated', cast_to_date=True)
        self.date_last_filing = self.get_datetime_from_node(self.case_details,
                                                            'date_last_filing',
                                                            cast_to_date=True)
        # Normalize the case name, then derive the short version from it.
        self.case_name = harmonize(
            self.get_str_from_node(self.case_details, 'case_name'))
        self.case_name_short = self.cnt.make_case_name_short(self.case_name)
        self.cause = self.get_str_from_node(self.case_details, 'case_cause')
        self.nature_of_suit = self.get_str_from_node(self.case_details,
                                                     'nature_of_suit')
        self.jury_demand = self.get_str_from_node(self.case_details,
                                                  'jury_demand')
        self.jurisdiction_type = self.get_str_from_node(
            self.case_details, 'jurisdiction')
        self.assigned_to, self.assigned_to_str = self.get_judges('assigned_to')
        self.referred_to, self.referred_to_str = self.get_judges('referred_to')
        self.blocked, self.date_blocked = get_blocked_status(
            self, self.document_count)

        # Non-parsed fields
        self.filepath_local = os.path.join('recap', self.path)
        self.filepath_ia = get_docketxml_url_from_path(self.path)

    def get_xml_contents(self):
        """Read the XML file at ``self.path`` from disk and return it as an
        lxml tree.

        A recovering parser is used so that malformed markup does not
        abort parsing.
        """
        recovering_parser = etree.XMLParser(recover=True)
        return etree.parse(self.path, recovering_parser)

    def get_case_details(self):
        """Return the ``case_details`` node, which holds most of the
        docket's metadata, so it can be parsed quickly later.
        """
        case_details_nodes = self.xml.xpath('//case_details')
        return case_details_nodes[0]

    def get_document_list(self):
        """Collect and return the XML nodes for the docket's documents."""
        document_nodes = self.xml.xpath('//document_list/document')
        return document_nodes

    def get_party_list(self):
        """Collect and return the XML nodes for the docket's parties."""
        party_nodes = self.xml.xpath('//party_list/party')
        return party_nodes

    def get_document_count(self):
        """Return how many documents are associated with this docket."""
        count = len(self.document_list)
        return count

    def make_documents(self, docket, debug):
        """Parse through the document nodes, making good objects.

        For every node, create a line item on the Docket (a DocketEntry), and
        create 1..n additional RECAPDocuments (attachments or regular documents)
        that are associated with that DocketEntry.

        :param docket: The Docket the new entries belong to.
        :param debug: When True, parse everything but save nothing to the DB.
        :return: A list of the pks of the RECAPDocuments that were made.
        """
        recap_docs = []
        for doc_node in self.document_list:
            # Make a DocketEntry object
            entry_number = doc_node.xpath('@doc_num')[0]
            attachment_number = int(doc_node.xpath('@attachment_num')[0])
            logger.info("Working on document %s, attachment %s" %
                        (entry_number, attachment_number))

            # Attachment number 0 marks the main document itself.
            if attachment_number == 0:
                document_type = RECAPDocument.PACER_DOCUMENT
            else:
                document_type = RECAPDocument.ATTACHMENT

            try:
                docket_entry = DocketEntry.objects.get(
                    docket=docket,
                    entry_number=entry_number,
                )
            except DocketEntry.DoesNotExist:
                if document_type == RECAPDocument.PACER_DOCUMENT:
                    docket_entry = DocketEntry(
                        docket=docket,
                        entry_number=entry_number,
                    )
                else:
                    logger.error("Tried to create attachment without a "
                                 "DocketEntry object to associate it with.")
                    continue

            if document_type == RECAPDocument.PACER_DOCUMENT:
                # Prefer freshly parsed values, falling back to whatever the
                # existing entry already had.
                date_filed = (self.get_datetime_from_node(
                    doc_node, 'date_filed', cast_to_date=True)
                              or docket_entry.date_filed)
                docket_entry.date_filed = date_filed
                docket_entry.description = (self.get_str_from_node(
                    doc_node, 'long_desc') or docket_entry.description)
                try:
                    if not debug:
                        docket_entry.save()
                except (IntegrityError, DocketEntry.MultipleObjectsReturned):
                    logger.error("Unable to create docket entry for docket "
                                 "#%s, on entry: %s." % (docket, entry_number))
                    continue

            recap_doc = self.make_recap_document(
                doc_node,
                docket_entry,
                entry_number,
                attachment_number,
                document_type,
                debug,
            )
            if recap_doc is not None:
                recap_docs.append(recap_doc)

        return [item.pk for item in recap_docs]

    def make_recap_document(self, doc_node, docket_entry, entry_number,
                            attachment_number, document_type, debug):
        """Make a PACER document.

        Gets or creates the RECAPDocument for the given entry/attachment,
        refreshes its metadata from the XML node, and returns it.

        :param doc_node: The XML node for the document.
        :param docket_entry: The DocketEntry the document belongs to.
        :param entry_number: The docket entry number.
        :param attachment_number: The attachment number (0 for the main doc).
        :param document_type: RECAPDocument.PACER_DOCUMENT or
        RECAPDocument.ATTACHMENT.
        :param debug: When True, do not save to the DB.
        """
        pacer_document_id = self.get_str_from_node(doc_node, 'pacer_doc_id')
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=docket_entry,
                document_number=entry_number,
                # Use the attachment number if it is not 0, else use None.
                attachment_number=attachment_number or None,
            )
        except RECAPDocument.DoesNotExist:
            rd = RECAPDocument(
                docket_entry=docket_entry,
                pacer_doc_id=pacer_document_id,
                document_number=entry_number,
            )
        else:
            # Keep the existing doc ID when the XML doesn't provide one.
            rd.pacer_doc_id = pacer_document_id or rd.pacer_doc_id

        rd.date_upload = self.get_datetime_from_node(doc_node, 'upload_date')
        rd.document_type = document_type or rd.document_type

        if not rd.is_available:
            # If we can't parse the availability node (it returns None),
            # default it to False.
            availability = self.get_bool_from_node(doc_node, 'available')
            rd.is_available = False if availability is None else availability
        if not rd.sha1:
            rd.sha1 = self.get_str_from_node(doc_node, 'sha1')
        rd.description = (self.get_str_from_node(doc_node, 'short_desc')
                          or rd.description)
        if rd.is_available:
            rd.filepath_ia = get_ia_document_url_from_path(
                self.path, entry_number, attachment_number)
            rd.filepath_local = os.path.join(
                'recap',
                get_local_document_url_from_path(self.path, entry_number,
                                                 attachment_number),
            )
            if rd.page_count is None:
                # Derive the page count from the file on disk, using its
                # extension to pick the extraction method.
                extension = rd.filepath_local.path.split('.')[-1]
                rd.page_count = get_page_count(rd.filepath_local.path,
                                               extension)
        if document_type == RECAPDocument.ATTACHMENT:
            rd.attachment_number = attachment_number
        if not debug:
            rd.save(do_extraction=False, index=False)
        return rd

    @transaction.atomic
    def make_parties(self, docket, debug):
        """Pull out the parties and their attorneys and save them to the DB.

        :param docket: The Docket the parties belong to.
        :param debug: When True, parse everything but save nothing to the DB.
        """
        # Cache of attorneys already seen on this docket, so that later
        # 'see above' contact references can be resolved.
        atty_obj_cache = {}
        for party_node in self.party_list:
            party_name = self.get_str_from_node(party_node, 'name')
            party_type = self.get_str_from_node(party_node, 'type')
            party_type = normalize_party_types(party_type)
            party_extra_info = self.get_str_from_node(party_node, 'extra_info')
            logger.info("Working on party '%s' of type '%s'" %
                        (party_name, party_type))

            try:
                party = Party.objects.get(name=party_name)
            except Party.DoesNotExist:
                party = Party(name=party_name)
                if not debug:
                    try:
                        party.save()
                    except IntegrityError:
                        # Saved elsewhere in the meantime (race); fetch the
                        # existing row instead.
                        party = Party.objects.get(name=party_name)

            # If the party type doesn't exist, make a new one.
            pts = party.party_types.filter(docket=docket, name=party_type)
            if pts.exists():
                pts.update(extra_info=party_extra_info)
            else:
                pt = PartyType(
                    docket=docket,
                    party=party,
                    name=party_type,
                    extra_info=party_extra_info,
                )
                if not debug:
                    pt.save()

            self.add_attorneys(docket, party_node, party, atty_obj_cache,
                               debug)

    def add_attorneys(self, docket, party_node, party, atty_obj_cache, debug):
        """Add the attorneys for one party of the docket.

        Parses the ``attorney_list/attorney`` nodes under party_node, looks
        up or creates Attorney objects (and their organizations), and links
        each attorney to the party via Role rows on the docket.

        :param docket: The Docket the attorneys belong to.
        :param party_node: The XML node of the party being processed.
        :param party: The Party object the attorneys represent.
        :param atty_obj_cache: Dict mapping attorney name to a previously
        parsed (atty, atty_org_info, atty_info) tuple, used to resolve
        "see above" contact entries within the same document.
        :param debug: If True, parse everything but save nothing to the DB.
        """
        atty_nodes = party_node.xpath('.//attorney_list/attorney')
        logger.info("Adding %s attorneys to the party." % len(atty_nodes))
        for atty_node in atty_nodes:
            atty_name = self.get_str_from_node(atty_node, 'attorney_name')
            logger.info("Adding attorney: '%s'" % atty_name)
            atty_contact_raw = self.get_str_from_node(atty_node, 'contact')
            if 'see above' in atty_contact_raw.lower():
                # The document refers back to an earlier entry for this
                # attorney's contact info; resolve it via the cache.
                logger.info("Got 'see above' entry for atty_contact_raw.")
                atty_contact_raw = ''
                try:
                    atty, atty_org_info, atty_info = atty_obj_cache[atty_name]
                except KeyError:
                    # logger.warn is a deprecated alias; use warning().
                    logger.warning("Unable to lookup 'see above' entry. "
                                   "Creating/using atty with no contact info.")
                    try:
                        atty = Attorney.objects.get(
                            name=atty_name, contact_raw=atty_contact_raw)
                    except Attorney.DoesNotExist:
                        atty = Attorney(name=atty_name,
                                        contact_raw=atty_contact_raw)
                        if not debug:
                            atty.save()

            else:
                # New attorney for this docket. Look them up in DB or create new
                # attorney if necessary.
                atty_org_info, atty_info = normalize_attorney_contact(
                    atty_contact_raw, fallback_name=atty_name)
                try:
                    logger.info("Didn't find attorney in cache, attempting "
                                "lookup in the DB.")
                    # Find an atty with the same name and one of another several
                    # IDs. Important to add contact_raw here, b/c if it cannot
                    # be parsed, all other values are blank.
                    q = Q()
                    fields = [
                        ('phone', atty_info['phone']),
                        ('fax', atty_info['fax']),
                        ('email', atty_info['email']),
                        ('contact_raw', atty_contact_raw),
                        ('organizations__lookup_key',
                         atty_org_info.get('lookup_key')),
                    ]
                    for field, lookup in fields:
                        if lookup:
                            q |= Q(**{field: lookup})
                    atty = Attorney.objects.get(Q(name=atty_name) & q)
                except Attorney.DoesNotExist:
                    logger.info("Unable to find matching attorney. Creating a "
                                "new one: %s" % atty_name)
                    atty = Attorney(name=atty_name,
                                    contact_raw=atty_contact_raw)
                    if not debug:
                        atty.save()
                except Attorney.MultipleObjectsReturned:
                    # logger.warn is a deprecated alias; use warning().
                    logger.warning("Got too many results for attorney: '%s' "
                                   "Punting." % atty_name)
                    continue

                # Cache the atty object and info for "See above" entries.
                atty_obj_cache[atty_name] = (atty, atty_org_info, atty_info)

            if atty_contact_raw:
                if atty_org_info:
                    logger.info("Adding organization information to "
                                "'%s': %s" % (atty_name, atty_org_info))
                    try:
                        org = AttorneyOrganization.objects.get(
                            lookup_key=atty_org_info['lookup_key'], )
                    except AttorneyOrganization.DoesNotExist:
                        org = AttorneyOrganization(**atty_org_info)
                        if not debug:
                            org.save()

                    # Add the attorney to the organization
                    if not debug:
                        AttorneyOrganizationAssociation.objects.get_or_create(
                            attorney=atty,
                            attorney_organization=org,
                            docket=docket,
                        )

                if atty_info:
                    atty.contact_raw = atty_contact_raw
                    atty.email = atty_info['email']
                    atty.phone = atty_info['phone']
                    atty.fax = atty_info['fax']
                    if not debug:
                        atty.save()

            atty_role_str = self.get_str_from_node(atty_node, 'attorney_role')
            atty_roles = [
                normalize_attorney_role(r) for r in atty_role_str.split('\n')
                if r
            ]
            atty_roles = [r for r in atty_roles if r['role'] is not None]
            atty_roles = remove_duplicate_dicts(atty_roles)
            if len(atty_roles) > 0:
                logger.info(
                    "Linking attorney '%s' to party '%s' via %s "
                    "roles: %s" %
                    (atty_name, party.name, len(atty_roles), atty_roles))
            else:
                logger.info("No role data parsed. Linking via 'UNKNOWN' role.")
                atty_roles = [{'role': Role.UNKNOWN, 'date_action': None}]

            if not debug:
                # Delete the old roles, replace with new.
                Role.objects.filter(attorney=atty, party=party,
                                    docket=docket).delete()
                Role.objects.bulk_create([
                    Role(attorney=atty,
                         party=party,
                         docket=docket,
                         **atty_role) for atty_role in atty_roles
                ])

    def get_court(self):
        """Look up the Court object named by the XML's <court> element.

        :raises ParsingException: If no Court matches the mapped court id.
        """
        court_str = self.case_details.xpath('court/text()')[0].strip()
        court_id = map_pacer_to_cl_id(court_str)
        try:
            return Court.objects.get(pk=court_id)
        except Court.DoesNotExist:
            raise ParsingException("Unable to identify court: %s" % court_str)

    @staticmethod
    def get_bool_from_node(node, path):
        try:
            s = node.xpath('%s/text()' % path)[0].strip()
            n = int(s)
        except IndexError:
            logger.debug("Couldn't get bool from path: %s" % path)
            return None
        except ValueError:
            logger.debug(
                "Couldn't convert text '%s' to int when making boolean "
                "for path: %s" % (s, path))
            return None
        else:
            return bool(n)

    @staticmethod
    def get_str_from_node(node, path):
        try:
            s = node.xpath('%s/text()' % path)[0].strip()
        except IndexError:
            logger.debug("Couldn't get string from path: %s" % path)
            return ''  # Return an empty string. Don't return None.
        else:
            return s

    def get_int_from_details(self, node):
        """Return the integer content of a node in the case details.

        :param node: The path to the node, relative to self.case_details.
        :return: The node's text cast to an int.
        :raises ParsingException: If the node is missing or its text cannot
        be parsed as an integer.
        """
        try:
            s = self.case_details.xpath('%s/text()' % node)[0].strip()
        except IndexError:
            # Previously a missing node escaped as a raw IndexError; raise
            # the same exception type callers already handle for bad values,
            # consistent with the other getters in this class.
            logger.debug("Couldn't find node %s" % node)
            raise ParsingException("Cannot extract int for node %s" % node)
        try:
            return int(s)
        except ValueError:
            # Can't parse string to int
            logger.debug("Couldn't get int for node %s" % node)
            raise ParsingException("Cannot extract int for node %s" % node)

    @staticmethod
    def get_datetime_from_node(node, path, cast_to_date=False):
        """Parse a datetime from the XML located at node.

        If cast_to_date is true, the datetime object will be converted to a
        date. Else, will return a datetime object in parsed TZ if possible.
        Failing that, it will assume UTC.
        """
        matches = node.xpath('%s/text()' % path)
        if not matches:
            logger.debug("Couldn't get date from path: %s" % path)
            return None
        s = matches[0].strip()
        try:
            d = parser.parse(s)
        except ValueError:
            logger.debug("Couldn't parse date: %s" % s)
            return None
        if d.tzinfo is None:
            # No TZ parsed from the string; assume UTC.
            d = d.replace(tzinfo=gettz('UTC'))
        if cast_to_date is True:
            return d.date()
        return d

    def get_judges(self, node):
        """Parse out the judge string and then look it up in the DB.

        :return: A (judge, raw_string) tuple. The judge is the single
        candidate found; None when the node is missing or when zero or
        multiple candidates match.
        """
        matches = self.case_details.xpath('%s/text()' % node)
        if not matches:
            logger.info("Couldn't get judge for node: %s" % node)
            return None, ''
        s = matches[0].strip()
        candidates = get_candidate_judges(s, self.court.pk, self.date_filed)
        if len(candidates) == 1:
            return candidates[0], s
        # Zero or ambiguous matches: keep the raw string, no judge object.
        return None, s