Example #1
    def __init__(self, cnt=None):
        super(AbstractSite, self).__init__()

        # Computed metadata
        self.hash = None
        self.html = None
        self.method = "GET"
        self.back_scrape_iterable = None
        self.downloader_executed = False
        self.cookies = {}
        self.cnt = cnt or CaseNameTweaker()
        self.request = {
            "verify": certifi.where(),
            "session": requests.session(),
            "headers": {"User-Agent": "Juriscraper"},
            # Disable CDN caching on sites like SCOTUS (ahem)
            "cache-control": "no-cache, no-store, max-age=1",
            "parameters": {},
            "request": None,
            "status": None,
            "url": None,
        }

        # Sub-classed metadata
        self.court_id = None
        self.url = None
        self.parameters = None
        self.uses_selenium = None
        self._opt_attrs = []
        self._req_attrs = []
        self._all_attrs = []
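
A minimal usage sketch for the "cnt or CaseNameTweaker()" pattern above: callers may pass in a shared tweaker so one instance is reused across sites, or omit it and let each site build its own. The import path and the commented-out subclass names are assumptions, not taken from the snippet; the expected short name for "Langley v. Google" matches the test data later in this collection.

from juriscraper.lib.string_utils import CaseNameTweaker  # assumed import path

# One tweaker shared across several Site instances.
shared_cnt = CaseNameTweaker()
# site_a = Ca1Site(cnt=shared_cnt)   # hypothetical AbstractSite subclasses
# site_b = Ca2Site(cnt=shared_cnt)

# Passing nothing makes each site fall back to its own instance via
# `self.cnt = cnt or CaseNameTweaker()`.
print(shared_cnt.make_case_name_short("Langley v. Google"))  # -> "Langley"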
Example #2
    def __init__(self, cnt=None):
        super(AbstractSite, self).__init__()

        # Computed metadata
        self.hash = None
        self.html = None
        self.method = 'GET'
        self.back_scrape_iterable = None
        self.downloader_executed = False
        self.cookies = {}
        self.cnt = cnt or CaseNameTweaker()
        self.request = {
            'verify': certifi.where(),
            'session': requests.session(),
            'headers': {
                'User-Agent': 'Juriscraper'
            },
            # Disable CDN caching on sites like SCOTUS (ahem)
            'cache-control': 'no-cache, no-store, max-age=1',
            'parameters': {},
            'request': None,
            'status': None,
            'url': None,
        }

        # Sub-classed metadata
        self.court_id = None
        self.url = None
        self.parameters = None
        self.uses_selenium = None
        self._opt_attrs = []
        self._req_attrs = []
        self._all_attrs = []
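
A hedged sketch of how the self.request dictionary above might be consumed once a downloader runs; the URL, the timeout value, and the use of a plain GET are placeholders rather than the real AbstractSite download logic, which is not part of this excerpt.

import certifi
import requests

request = {
    "verify": certifi.where(),
    "session": requests.session(),
    "headers": {"User-Agent": "Juriscraper"},
    "parameters": {},
    "url": "https://www.example.com/opinions",  # placeholder URL
    "request": None,
    "status": None,
}

# Turn the dict into an actual GET and record the results back into it.
response = request["session"].get(
    request["url"],
    headers=request["headers"],
    verify=request["verify"],
    params=request["parameters"],
    timeout=30,
)
request["request"] = response.request
request["status"] = response.status_code
print(request["status"])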
Example #3
def get_pdfs(options):
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each of
    which represents a PDF we need to download and merge into our normal
    tables: Docket, DocketEntry, and RECAPDocument.

    In this function, we iterate over the entire table of results, merge it
    into our normal tables, and then download and extract the PDF.

    :return: None
    """
    q = options["queue"]
    index = options["index"]
    cnt = CaseNameTweaker()
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only("pk")
    count = rows.count()
    task_name = "downloading"
    if index:
        task_name += " and indexing"
    logger.info("%s %s items from PACER." % (task_name, count))
    throttle = CeleryThrottle(queue_name=q)
    completed = 0
    for row in queryset_generator(rows):
        throttle.maybe_wait()
        if completed % 30000 == 0:
            pacer_session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            pacer_session.login()
        c = chain(
            process_free_opinion_result.si(row.pk, cnt).set(queue=q),
            get_and_process_pdf.s(pacer_session.cookies, row.pk).set(queue=q),
            delete_pacer_row.s(row.pk).set(queue=q),
        )
        if index:
            c |= add_items_to_solr.s("search.RECAPDocument").set(queue=q)
        c.apply_async()
        completed += 1
        if completed % 1000 == 0:
            logger.info(
                "Sent %s/%s tasks to celery for %s so "
                "far." % (completed, count, task_name)
            )
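
The chain built in get_pdfs is plain Celery canvas usage: .si() creates an immutable signature that ignores the previous task's result, .s() creates a partial that receives it, .set(queue=q) routes each link to a queue, and |= appends an optional indexing link. Below is a self-contained sketch of that pattern; the app, broker, and task names are throwaway stand-ins, not CourtListener tasks.

from celery import Celery, chain

app = Celery("sketch", broker="memory://")  # throwaway app for illustration

@app.task
def fetch_row(pk):
    return {"pk": pk}

@app.task
def download_pdf(prev, pk):          # receives fetch_row's return value first
    return {"pk": pk, "downloaded": True}

@app.task
def delete_row(prev, pk):
    return pk

@app.task
def index_doc(prev, model_name):
    return model_name

def build_chain(pk, q, index=False):
    c = chain(
        fetch_row.si(pk).set(queue=q),      # .si(): ignore any parent result
        download_pdf.s(pk).set(queue=q),    # .s(): gets fetch_row's result
        delete_row.s(pk).set(queue=q),
    )
    if index:
        c |= index_doc.s("search.RECAPDocument").set(queue=q)
    return c

# build_chain(42, "pacer", index=True).apply_async()  # needs a broker and worker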
Example #4
def get_pdfs(options: OptionsType) -> None:
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each of
    which represents a PDF we need to download and merge into our normal
    tables: Docket, DocketEntry, and RECAPDocument.

    In this function, we iterate over the entire table of results, merge it
    into our normal tables, and then download and extract the PDF.

    :return: None
    """
    q = cast(str, options["queue"])
    index = options["index"]
    cnt = CaseNameTweaker()
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only("pk")
    count = rows.count()
    task_name = "downloading"
    if index:
        task_name += " and indexing"
    logger.info(f"{task_name} {count} items from PACER.")
    throttle = CeleryThrottle(queue_name=q)
    completed = 0
    for row in rows.iterator():
        throttle.maybe_wait()
        c = chain(
            process_free_opinion_result.si(
                row.pk,
                row.court_id,
                cnt,
            ).set(queue=q),
            get_and_process_free_pdf.s(row.pk, row.court_id).set(queue=q),
            delete_pacer_row.s(row.pk).set(queue=q),
        )
        if index:
            c |= add_items_to_solr.s("search.RECAPDocument").set(queue=q)
        c.apply_async()
        completed += 1
        if completed % 1000 == 0:
            logger.info(
                f"Sent {completed}/{count} tasks to celery for {task_name} so far."
            )
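
The typed variant relies on typing.cast to narrow a value pulled from a union-typed options dict. A small standalone sketch of that pattern; OptionsType here is a stand-in alias, since the project's real definition is not shown in this excerpt.

from typing import Dict, Union, cast

OptionsType = Dict[str, Union[bool, int, str]]  # stand-in for the real alias

def queue_name(options: OptionsType) -> str:
    # cast() only informs the type checker; it does no runtime conversion or checking.
    return cast(str, options["queue"])

print(queue_name({"queue": "pacer", "index": False}))  # -> pacer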
Example #5
    def __init__(self, cnt=None):
        super(AbstractSite, self).__init__()

        # Computed metadata
        self.hash = None
        self.html = None
        self.method = 'GET'
        self.use_sessions = False
        self.status = None
        self.back_scrape_iterable = None
        self.cookies = {}
        self.cnt = cnt or CaseNameTweaker()

        # Sub-classed metadata
        self.court_id = None
        self.url = None
        self.parameters = None
        self.uses_selenium = None
        self._opt_attrs = []
        self._req_attrs = []
        self._all_attrs = []
Example #6
 def __init__(self, stdout=None, stderr=None, no_color=False):
     super(Command, self).__init__(stdout=None, stderr=None, no_color=False)
     self.cnt = CaseNameTweaker()
Example #7
class Command(VerboseCommand):
    help = 'Runs the Juriscraper toolkit against one or many jurisdictions.'

    def __init__(self, stdout=None, stderr=None, no_color=False):
        super(Command, self).__init__(stdout=None, stderr=None, no_color=False)
        self.cnt = CaseNameTweaker()

    def add_arguments(self, parser):
        parser.add_argument(
            '--daemon',
            action='store_true',
            default=False,
            help=('Use this flag to turn on daemon mode, in which all '
                  'courts requested will be scraped in turn, '
                  'nonstop, in a loop.'),
        )
        parser.add_argument(
            '--rate',
            type=int,
            default=30,
            help=('The length of time in minutes it takes to crawl '
                  'all requested courts. Particularly useful if it is '
                  'desired to quickly scrape over all courts. Default '
                  'is 30 minutes.'),
        )
        parser.add_argument(
            '--courts',
            type=str,
            dest='court_id',
            metavar="COURTID",
            required=True,
            help=('The court(s) to scrape and extract. This should be '
                  'in the form of a python module or package import '
                  'from the Juriscraper library, e.g. '
                  '"juriscraper.opinions.united_states.federal_appellate.ca1" '
                  'or simply "opinions" to do all opinions.'),
        )
        parser.add_argument(
            '--fullcrawl',
            dest='full_crawl',
            action='store_true',
            default=False,
            help="Disable duplicate aborting.",
        )

    def make_objects(self, item, court, sha1_hash, content):
        """Takes the meta data from the scraper and associates it with objects.

        Returns the created objects.
        """
        blocked = item['blocked_statuses']
        if blocked:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))
        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            source=Docket.SCRAPER,
        )

        west_cite_str = item.get('west_citations', '')
        state_cite_str = item.get('west_state_citations', '')
        neutral_cite_str = item.get('neutral_citations', '')
        cluster = OpinionCluster(
            judges=item.get('judges', ''),
            date_filed=item['case_dates'],
            date_filed_is_approximate=item['date_filed_is_approximate'],
            case_name=item['case_names'],
            case_name_short=case_name_short,
            source='C',
            precedential_status=item['precedential_statuses'],
            nature_of_suit=item.get('nature_of_suit', ''),
            blocked=blocked,
            date_blocked=date_blocked,
            # These three fields are replaced below.
            federal_cite_one=west_cite_str,
            state_cite_one=state_cite_str,
            neutral_cite=neutral_cite_str,
            syllabus=item.get('summaries', ''),
        )
        citations = []
        cite_types = [
            (west_cite_str, Citation.WEST),
            (state_cite_str, Citation.STATE),
            (neutral_cite_str, Citation.NEUTRAL),
        ]
        for cite_str, cite_type in cite_types:
            if cite_str:
                citations.append(make_citation(cite_str, cluster, cite_type))
        opinion = Opinion(
            type='010combined',
            sha1=sha1_hash,
            download_url=item['download_urls'],
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            file_name = trunc(item['case_names'].lower(), 75) + extension
            opinion.file_with_date = cluster.date_filed
            opinion.local_path.save(file_name, cf, save=False)
        except Exception:
            msg = ('Unable to save binary to disk. Deleted '
                   'item: %s.\n %s' %
                   (item['case_names'], traceback.format_exc()))
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, opinion, cluster, citations, error

    def save_everything(self, items, index=False, backscrape=False):
        """Saves all the sub items and associates them as appropriate.
        """
        docket, cluster = items['docket'], items['cluster']
        opinion, citations = items['opinion'], items['citations']
        docket.save()
        cluster.docket = docket
        cluster.save(index=False)  # Index only when the opinion is associated.

        for citation in citations:
            citation.cluster_id = cluster.pk
            citation.save()

        if cluster.judges:
            candidate_judges = get_candidate_judges(
                cluster.judges,
                docket.court.pk,
                cluster.date_filed,
            )
            if len(candidate_judges) == 1:
                opinion.author = candidate_judges[0]

            if len(candidate_judges) > 1:
                for candidate in candidate_judges:
                    cluster.panel.add(candidate)

        opinion.cluster = cluster
        opinion.save(index=index)
        if not backscrape:
            RealTimeQueue.objects.create(item_type='o', item_pk=opinion.pk)

    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i, item in enumerate(site):
                msg, r = get_binary_content(item['download_urls'],
                                            site.cookies,
                                            site._get_adapter_instance(),
                                            method=site.method)
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level='WARNING', court=court,
                             message=msg).save()
                    continue

                content = site.cleanup_content(r.content)

                current_date = item['case_dates']
                try:
                    next_date = site[i + 1]['case_dates']
                except IndexError:
                    next_date = None

                # request.content is sometimes a str, sometimes unicode, so
                # force it all to be bytes, pleasing hashlib.
                sha1_hash = sha1(force_bytes(content))
                if (court_str == 'nev'
                        and item['precedential_statuses'] == 'Unpublished'):
                    # Nevada's non-precedential cases have different SHA1
                    # sums every time.
                    lookup_params = {
                        'lookup_value': item['download_urls'],
                        'lookup_by': 'download_url'
                    }
                else:
                    lookup_params = {
                        'lookup_value': sha1_hash,
                        'lookup_by': 'sha1'
                    }

                onwards = dup_checker.press_on(Opinion, current_date,
                                               next_date, **lookup_params)
                if dup_checker.emulate_break:
                    break

                if onwards:
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' %
                                item['download_urls'].encode('utf-8'))
                    dup_checker.reset()

                    docket, opinion, cluster, citations, error = self.make_objects(
                        item, court, sha1_hash, content)

                    if error:
                        download_error = True
                        continue

                    self.save_everything(
                        items={
                            'docket': docket,
                            'opinion': opinion,
                            'cluster': cluster,
                            'citations': citations,
                        },
                        index=False,
                    )
                    extract_doc_content.delay(
                        opinion.pk,
                        do_ocr=True,
                        citation_jitter=True,
                    )

                    logger.info("Successfully added doc {pk}: {name}".format(
                        pk=opinion.pk,
                        name=item['case_names'].encode('utf-8'),
                    ))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled opinions." % site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)

    def parse_and_scrape_site(self, mod, full_crawl):
        site = mod.Site().parse()
        self.scrape_court(site, full_crawl)

    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)
        global die_now

        # Handle SIGTERM so the scraper can shut down safely.
        signal.signal(signal.SIGTERM, signal_handler)

        module_strings = build_module_list(options['court_id'])
        if not len(module_strings):
            raise CommandError('Unable to import module or package. Aborting.')

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        wait = (options['rate'] * 60) / num_courts
        i = 0
        while i < num_courts:
            # this catches SIGTERM, so the code can be killed safely.
            if die_now:
                logger.info("The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)

            mod = __import__("%s.%s" % (package, module), globals(), locals(),
                             [module])
            # noinspection PyBroadException
            try:
                self.parse_and_scrape_site(mod, options['full_crawl'])
            except Exception as e:
                # noinspection PyBroadException
                try:
                    msg = ('********!! CRAWLER DOWN !!***********\n'
                           '*****scrape_court method failed!*****\n'
                           '********!! ACTION NEEDED !!**********\n%s' %
                           traceback.format_exc())
                    logger.critical(msg)

                    # opinions.united_states.federal.ca9_u --> ca9
                    court_str = mod.Site.__module__.split('.')[-1].split(
                        '_')[0]
                    court = Court.objects.get(pk=court_str)
                    ErrorLog(log_level='CRITICAL', court=court,
                             message=msg).save()
                except Exception as e:
                    # This is very important. Without this, an exception
                    # above will crash the caller.
                    pass
            finally:
                time.sleep(wait)
                last_court_in_list = (i == (num_courts - 1))
                if last_court_in_list and options['daemon']:
                    # Start over...
                    logger.info(
                        "All jurisdictions done. Looping back to "
                        "the beginning because daemon mode is enabled.")
                    i = 0
                else:
                    i += 1

        logger.info("The scraper has stopped.")
        sys.exit(0)
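
handle() above imports each scraper module dynamically with __import__("%s.%s" % (package, module), globals(), locals(), [module]). Below is a sketch of the same step spelled with importlib.import_module, which returns the leaf module directly; the module path used here is simply the court module cited in the --courts help text.

import importlib

module_string = "juriscraper.opinions.united_states.federal_appellate.ca1"
package, module = module_string.rsplit(".", 1)

# Equivalent to the __import__(...) call above, without the fromlist quirk.
mod = importlib.import_module(f"{package}.{module}")
# site = mod.Site().parse()  # as in parse_and_scrape_site() above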
Example #8
class PacerXMLParser(object):
    """A class to parse a PACER XML file"""

    cnt = CaseNameTweaker()

    def __init__(self, path):
        logger.info("Initializing parser for %s" % path)
        # High-level attributes
        self.path = path
        self.xml = self.get_xml_contents()
        self.case_details = self.get_case_details()
        self.document_list = self.get_document_list()
        self.party_list = self.get_party_list()
        self.document_count = self.get_document_count()

        # Docket attributes
        self.court = self.get_court()
        self.docket_number = self.get_str_from_node(self.case_details,
                                                    'docket_num')
        self.pacer_case_id = self.get_str_from_node(self.case_details,
                                                    'pacer_case_num')
        self.date_filed = self.get_datetime_from_node(self.case_details,
                                                      'date_case_filed',
                                                      cast_to_date=True)
        self.date_terminated = self.get_datetime_from_node(
            self.case_details, 'date_case_terminated', cast_to_date=True)
        self.date_last_filing = self.get_datetime_from_node(self.case_details,
                                                            'date_last_filing',
                                                            cast_to_date=True)
        self.case_name = harmonize(
            self.get_str_from_node(self.case_details, 'case_name'))
        self.case_name_short = self.cnt.make_case_name_short(self.case_name)
        self.cause = self.get_str_from_node(self.case_details, 'case_cause')
        self.nature_of_suit = self.get_str_from_node(self.case_details,
                                                     'nature_of_suit')
        self.jury_demand = self.get_str_from_node(self.case_details,
                                                  'jury_demand')
        self.jurisdiction_type = self.get_str_from_node(
            self.case_details, 'jurisdiction')
        self.assigned_to, self.assigned_to_str = self.get_judges('assigned_to')
        self.referred_to, self.referred_to_str = self.get_judges('referred_to')
        self.blocked, self.date_blocked = get_blocked_status(
            self, self.document_count)

        # Non-parsed fields
        self.filepath_local = os.path.join('recap', self.path)
        self.filepath_ia = get_docketxml_url_from_path(self.path)

    def get_xml_contents(self):
        """Extract the XML from the file on disk and return it as an lxml
        tree
        """
        xml_parser = etree.XMLParser(recover=True)
        tree = etree.parse(self.path, xml_parser)

        return tree

    def get_case_details(self):
        """Most of the details are in the case_details node, so set it aside
        for faster parsing.
        """
        return self.xml.xpath('//case_details')[0]

    def get_document_list(self):
        """Get the XML nodes for the documents"""
        return self.xml.xpath('//document_list/document')

    def get_party_list(self):
        """Get the XML nodes for the parties"""
        return self.xml.xpath('//party_list/party')

    def get_document_count(self):
        """Get the number of documents associated with this docket."""
        return len(self.document_list)

    def make_documents(self, docket, debug):
        """Parse through the document nodes, making good objects.

        For every node, create a line item on the Docket (a DocketEntry), and
        create 1..n additional RECAPDocuments (attachments or regular documents)
        that are associated with that DocketEntry.

        Returns None if an error occurs.
        """
        recap_docs = []
        for doc_node in self.document_list:
            # Make a DocketEntry object
            entry_number = doc_node.xpath('@doc_num')[0]
            attachment_number = int(doc_node.xpath('@attachment_num')[0])
            logger.info("Working on document %s, attachment %s" %
                        (entry_number, attachment_number))

            if attachment_number == 0:
                document_type = RECAPDocument.PACER_DOCUMENT
            else:
                document_type = RECAPDocument.ATTACHMENT

            try:
                docket_entry = DocketEntry.objects.get(
                    docket=docket,
                    entry_number=entry_number,
                )
            except DocketEntry.DoesNotExist:
                if document_type == RECAPDocument.PACER_DOCUMENT:
                    docket_entry = DocketEntry(
                        docket=docket,
                        entry_number=entry_number,
                    )
                else:
                    logger.error("Tried to create attachment without a "
                                 "DocketEntry object to associate it with.")
                    continue

            if document_type == RECAPDocument.PACER_DOCUMENT:
                date_filed = (self.get_datetime_from_node(
                    doc_node, 'date_filed', cast_to_date=True)
                              or docket_entry.date_filed)
                docket_entry.date_filed = date_filed
                docket_entry.description = (self.get_str_from_node(
                    doc_node, 'long_desc') or docket_entry.description)
                try:
                    if not debug:
                        docket_entry.save()
                except (IntegrityError, DocketEntry.MultipleObjectsReturned):
                    logger.error("Unable to create docket entry for docket "
                                 "#%s, on entry: %s." % (docket, entry_number))
                    continue

            recap_doc = self.make_recap_document(
                doc_node,
                docket_entry,
                entry_number,
                attachment_number,
                document_type,
                debug,
            )
            if recap_doc is not None:
                recap_docs.append(recap_doc)

        return [item.pk for item in recap_docs]

    def make_recap_document(self, doc_node, docket_entry, entry_number,
                            attachment_number, document_type, debug):
        """Make a PACER document."""
        pacer_document_id = self.get_str_from_node(doc_node, 'pacer_doc_id')
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=docket_entry,
                document_number=entry_number,
                # Use the attachment number if it is not 0, else use None.
                attachment_number=attachment_number or None,
            )
        except RECAPDocument.DoesNotExist:
            rd = RECAPDocument(
                docket_entry=docket_entry,
                pacer_doc_id=pacer_document_id,
                document_number=entry_number,
            )
        else:
            rd.pacer_doc_id = pacer_document_id or rd.pacer_doc_id

        rd.date_upload = self.get_datetime_from_node(doc_node, 'upload_date')
        rd.document_type = document_type or rd.document_type

        # If we can't parse the availability node (it returns None), default it
        # to False.
        availability = self.get_bool_from_node(doc_node, 'available')
        rd.is_available = False if availability is None else availability
        rd.sha1 = self.get_str_from_node(doc_node, 'sha1')
        rd.description = (self.get_str_from_node(doc_node, 'short_desc')
                          or rd.description)
        if rd.is_available:
            rd.filepath_ia = get_ia_document_url_from_path(
                self.path, entry_number, attachment_number)
            rd.filepath_local = os.path.join(
                'recap',
                get_local_document_url_from_path(self.path, entry_number,
                                                 attachment_number),
            )
            if rd.page_count is None:
                extension = rd.filepath_local.path.split('.')[-1]
                rd.page_count = get_page_count(rd.filepath_local.path,
                                               extension)
        if document_type == RECAPDocument.ATTACHMENT:
            rd.attachment_number = attachment_number
        if not debug:
            try:
                rd.save(do_extraction=False, index=False)
            except IntegrityError as e:
                # This happens when a pacer_doc_id has been wrongly set as
                # the document_number, see for example, document 19 and
                # document 00405193374 here: https://ia802300.us.archive.org/23/items/gov.uscourts.ca4.14-1872/gov.uscourts.ca4.14-1872.docket.xml
                logger.error(
                    "Unable to create RECAPDocument for document #%s, "
                    "attachment #%s on entry: %s due to "
                    "IntegrityError." %
                    (rd.document_number, rd.attachment_number,
                     rd.docket_entry))
                return None
        return rd

    @transaction.atomic
    def make_parties(self, docket, debug):
        """Pull out the parties and their attorneys and save them to the DB."""
        atty_obj_cache = {}
        for party_node in self.party_list:
            party_name = self.get_str_from_node(party_node, 'name')
            party_type = self.get_str_from_node(party_node, 'type')
            party_type = normalize_party_types(party_type)
            party_extra_info = self.get_str_from_node(party_node, 'extra_info')
            logger.info("Working on party '%s' of type '%s'" %
                        (party_name, party_type))

            try:
                party = Party.objects.get(name=party_name)
            except Party.DoesNotExist:
                party = Party(
                    name=party_name,
                    extra_info=party_extra_info,
                )
                if not debug:
                    party.save()
            else:
                if party_extra_info and not debug:
                    party.extra_info = party_extra_info
                    party.save()

            # If the party type doesn't exist, make a new one.
            if not party.party_types.filter(docket=docket,
                                            name=party_type).exists():
                pt = PartyType(
                    docket=docket,
                    party=party,
                    name=party_type,
                )
                if not debug:
                    pt.save()

            self.add_attorneys(docket, party_node, party, atty_obj_cache,
                               debug)

    def add_attorneys(self, docket, party_node, party, atty_obj_cache, debug):
        # Get the most recent date on the docket. We'll use this to have the
        # most updated attorney info.
        newest_docket_date = max([
            d for d in [
                docket.date_filed, docket.date_terminated,
                docket.date_last_filing
            ] if d
        ], )
        atty_nodes = party_node.xpath('.//attorney_list/attorney')
        logger.info("Adding %s attorneys to the party." % len(atty_nodes))
        for atty_node in atty_nodes:
            atty_name = self.get_str_from_node(atty_node, 'attorney_name')
            logger.info("Adding attorney: '%s'" % atty_name)
            atty_contact_raw = self.get_str_from_node(atty_node, 'contact')
            if 'see above' in atty_contact_raw.lower():
                logger.info("Got 'see above' entry for atty_contact_raw.")
                atty_contact_raw = ''
                try:
                    atty, atty_org_info, atty_info = atty_obj_cache[atty_name]
                except KeyError:
                    logger.warn("Unable to lookup 'see above' entry. "
                                "Creating/using atty with no contact info.")
                    try:
                        atty = Attorney.objects.get(
                            name=atty_name, contact_raw=atty_contact_raw)
                    except Attorney.DoesNotExist:
                        atty = Attorney(name=atty_name,
                                        date_sourced=newest_docket_date,
                                        contact_raw=atty_contact_raw)
                        if not debug:
                            atty.save()

            else:
                # New attorney for this docket. Look them up in DB or create new
                # attorney if necessary.
                atty_org_info, atty_info = normalize_attorney_contact(
                    atty_contact_raw, fallback_name=atty_name)
                try:
                    logger.info("Didn't find attorney in cache, attempting "
                                "lookup in the DB.")
                    # Find an atty with the same name and one of another several
                    # IDs. Important to add contact_raw here, b/c if it cannot
                    # be parsed, all other values are blank.
                    q = Q()
                    fields = [
                        ('phone', atty_info['phone']),
                        ('fax', atty_info['fax']),
                        ('email', atty_info['email']),
                        ('contact_raw', atty_contact_raw),
                        ('organizations__lookup_key',
                         atty_org_info.get('lookup_key')),
                    ]
                    for field, lookup in fields:
                        if lookup:
                            q |= Q(**{field: lookup})
                    atty = Attorney.objects.get(Q(name=atty_name) & q)
                except Attorney.DoesNotExist:
                    logger.info("Unable to find matching attorney. Creating a "
                                "new one: %s" % atty_name)
                    atty = Attorney(name=atty_name,
                                    date_sourced=newest_docket_date,
                                    contact_raw=atty_contact_raw)
                    if not debug:
                        atty.save()
                except Attorney.MultipleObjectsReturned:
                    logger.warn("Got too many results for attorney: '%s' "
                                "Punting." % atty_name)
                    continue

                # Cache the atty object and info for "See above" entries.
                atty_obj_cache[atty_name] = (atty, atty_org_info, atty_info)

            if atty_contact_raw:
                if atty_org_info:
                    logger.info("Adding organization information to "
                                "'%s': %s" % (atty_name, atty_org_info))
                    try:
                        org = AttorneyOrganization.objects.get(
                            lookup_key=atty_org_info['lookup_key'], )
                    except AttorneyOrganization.DoesNotExist:
                        org = AttorneyOrganization(**atty_org_info)
                        if not debug:
                            org.save()

                    # Add the attorney to the organization
                    if not debug:
                        AttorneyOrganizationAssociation.objects.get_or_create(
                            attorney=atty,
                            attorney_organization=org,
                            docket=docket,
                        )

                atty_info_is_newer = (atty.date_sourced <= newest_docket_date)
                if atty_info and atty_info_is_newer:
                    logger.info("Updating atty info because %s is more recent "
                                "than %s." %
                                (newest_docket_date, atty.date_sourced))
                    atty.date_sourced = newest_docket_date
                    atty.contact_raw = atty_contact_raw
                    atty.email = atty_info['email']
                    atty.phone = atty_info['phone']
                    atty.fax = atty_info['fax']
                    if not debug:
                        atty.save()

            atty_role_str = self.get_str_from_node(atty_node, 'attorney_role')
            atty_roles = [
                normalize_attorney_role(r) for r in atty_role_str.split('\n')
                if r
            ]
            atty_roles = [r for r in atty_roles if r['role'] is not None]
            atty_roles = remove_duplicate_dicts(atty_roles)
            if len(atty_roles) > 0:
                logger.info(
                    "Linking attorney '%s' to party '%s' via %s "
                    "roles: %s" %
                    (atty_name, party.name, len(atty_roles), atty_roles))
            else:
                logger.info("No role data parsed. Linking via 'UNKNOWN' role.")
                atty_roles = [{'role': Role.UNKNOWN, 'date_action': None}]

            if not debug:
                # Delete the old roles, replace with new.
                Role.objects.filter(attorney=atty, party=party,
                                    docket=docket).delete()
                Role.objects.bulk_create([
                    Role(attorney=atty,
                         party=party,
                         docket=docket,
                         **atty_role) for atty_role in atty_roles
                ])

    def get_court(self):
        """Extract the court from the XML and return it as a Court object"""
        court_str = self.case_details.xpath('court/text()')[0].strip()
        try:
            c = Court.objects.get(pk=map_pacer_to_cl_id(court_str))
        except Court.DoesNotExist:
            raise ParsingException("Unable to identify court: %s" % court_str)
        else:
            return c

    @staticmethod
    def get_bool_from_node(node, path):
        try:
            s = node.xpath('%s/text()' % path)[0].strip()
            n = int(s)
        except IndexError:
            logger.debug("Couldn't get bool from path: %s" % path)
            return None
        except ValueError:
            logger.debug(
                "Couldn't convert text '%s' to int when making boolean "
                "for path: %s" % (s, path))
            return None
        else:
            return bool(n)

    @staticmethod
    def get_str_from_node(node, path):
        try:
            s = node.xpath('%s/text()' % path)[0].strip()
        except IndexError:
            logger.debug("Couldn't get string from path: %s" % path)
            return ''  # Return an empty string. Don't return None.
        else:
            return s

    def get_int_from_details(self, node):
        s = self.case_details.xpath('%s/text()' % node)[0].strip()
        try:
            return int(s)
        except ValueError:
            # Can't parse string to int
            logger.debug("Couldn't get int for node %s" % node)
            raise ParsingException("Cannot extract int for node %s" % node)

    @staticmethod
    def get_datetime_from_node(node, path, cast_to_date=False):
        """Parse a datetime from the XML located at node.

        If cast_to_date is true, the datetime object will be converted to a
        date. Else, will return a datetime object in parsed TZ if possible.
        Failing that, it will assume UTC.
        """
        try:
            s = node.xpath('%s/text()' % path)[0].strip()
        except IndexError:
            logger.debug("Couldn't get date from path: %s" % path)
            return None
        else:
            try:
                d = parser.parse(s)
            except ValueError:
                logger.debug("Couldn't parse date: %s" % s)
                return None
            else:
                d = d.replace(tzinfo=d.tzinfo
                              or gettz('UTC'))  # Set it to UTC.
                if cast_to_date is True:
                    return d.date()
                return d

    def get_judges(self, node):
        """Parse out the judge string and then look it up in the DB"""
        try:
            s = self.case_details.xpath('%s/text()' % node)[0].strip()
        except IndexError:
            logger.info("Couldn't get judge for node: %s" % node)
            return None, ''
        else:
            judges = get_candidate_judges(s, self.court.pk, self.date_filed)
            if len(judges) == 0:
                return None, s
            elif len(judges) == 1:
                return judges[0], s
            else:
                return None, s
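
The node-reading helpers in PacerXMLParser boil down to a small lxml + dateutil recipe: a recovering XML parser, xpath('<path>/text()') lookups that tolerate missing nodes, and dates given a UTC timezone when none is parsed. Here is a self-contained sketch against an inline document; the element names and values are sample data, not a real PACER docket.

from dateutil import parser
from dateutil.tz import gettz
from lxml import etree

xml = b"""
<case_details>
  <docket_num>1:16-cv-00745</docket_num>
  <date_case_filed>2016-04-21</date_case_filed>
</case_details>
"""
tree = etree.fromstring(xml, etree.XMLParser(recover=True))

def get_str(node, path):
    try:
        return node.xpath("%s/text()" % path)[0].strip()
    except IndexError:
        return ""  # mirror get_str_from_node: empty string, never None

docket_number = get_str(tree, "docket_num")
filed = parser.parse(get_str(tree, "date_case_filed"))
filed = filed.replace(tzinfo=filed.tzinfo or gettz("UTC")).date()
print(docket_number, filed)  # 1:16-cv-00745 2016-04-21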
Example #9
 def test_make_short_name(self):
     test_pairs = [
         # In re and Matter of
         ("In re Lissner", "In re Lissner"),
         ("Matter of Lissner", "Matter of Lissner"),
         # Plaintiff is in bad word list
         ("State v. Lissner", "Lissner"),
         ("People v. Lissner", "Lissner"),
         ("California v. Lissner", "Lissner"),
         ("Dallas v. Lissner", "Lissner"),
         # Basic 3-word case
         ("Langley v. Google", "Langley"),
         # Similar to above, but more than 3 words
         ("Langley v. Google foo", "Langley"),
         # United States v. ...
         ("United States v. Lissner", "Lissner"),
         # Corporate first name
         ("Google, Inc. v. Langley", "Langley"),
         ("Special, LLC v. Langley", "Langley"),
         ("Google Corp. v. Langley", "Langley"),
         # Shorter appellant than plaintiff
         ("Michael Lissner v. Langley", "Langley"),
         # Multi-v with and w/o a bad_word
         ("Alameda v. Victor v. Keyboard", ""),
         ("Bloggers v. Victor v. Keyboard", ""),
         # Long left, short right
         ("Many words here v. Langley", "Langley"),
         # Other manually added items
         ("Ilarion v. State", "Ilarion"),
         ("Imery v. Vangil Ingenieros", "Imery"),
         # Many more tests from real data!
         ("Bean v. City of Monahans", "Bean"),
         ("Blanke v. Time, Inc.", "Blanke"),
         ("New York Life Ins. Co. v. Deshotel", "Deshotel"),
         ("Deatherage v. Deatherage", "Deatherage"),
         ("Gonzalez Vargas v. Holder", ""),
         ("Campbell v. Wainwright", "Campbell"),
         ("Liggett & Myers Tobacco Co. v. Finzer", "Finzer"),
         ("United States v. Brenes", "Brenes"),
         ("A.H. Robins Co., Inc. v. Eli Lilly & Co", ""),
         ("McKellar v. Hazen", "McKellar"),
         ("Gil v. State", "Gil"),
         ("Fuentes v. Owen", "Fuentes"),
         ("State v. Shearer", "Shearer"),
         ("United States v. Smither", "Smither"),
         ("People v. Bradbury", "Bradbury"),
         ("Venable (James) v. State", ""),
         ("Burkhardt v. Bailey", "Burkhardt"),
         ("DeLorenzo v. Bales", "DeLorenzo"),
         ("Loucks v. Bauman", "Loucks"),
         ("Kenneth Stern v. Robert Weinstein", ""),
         ("Rayner v. Secretary of Health and Human Services", "Rayner"),
         ("Rhyne v. Martin", "Rhyne"),
         ("State v. Wolverton", "Wolverton"),
         ("State v. Flood", "Flood"),
         ("Amason v. Natural Gas Pipeline Co.", "Amason"),
         ("United States v. Bryant", "Bryant"),
         ("WELLS FARGO BANK v. APACHE TRIBE OF OKLAHOMA", ""),
         ("Stewart v. Tupperware Corp.", "Stewart"),
         ("Society of New York Hosp. v. ASSOCIATED HOSP. SERV. OF NY", ""),
         ("Stein v. State Tax Commission", "Stein"),
         (
             "The Putnam Pit, Inc. Geoffrey Davidian v. City of Cookeville, Tennessee Jim Shipley",
             "",
         ),
         ("People v. Armstrong", "Armstrong"),
         ("Weeks v. Weeks", "Weeks"),
         ("Smith v. Xerox Corp.", ""),
         ("In Interest of Ad", ""),
         ("People v. Forsyth", "Forsyth"),
         ("State v. LeClair", "LeClair"),
         ("Agristor Credit Corp. v. Unruh", "Unruh"),
         ("United States v. Larry L. Stewart", ""),
         ("Starling v. United States", "Starling"),
         ("United States v. Pablo Colin-Molina", ""),
         ("Kenneth N. Juhl v. The United States", ""),
         ("Matter of Wilson", "Matter of Wilson"),
         ("In Re Damon H.", ""),
         ("Centennial Ins. Co. v. Zylberberg", "Zylberberg"),
         ("United States v. Donald Lee Stotler", ""),
         ("Byndloss v. State", "Byndloss"),
         ("People v. Piatkowski", "Piatkowski"),
         ("United States v. Willie James Morgan", ""),
         ("Harbison (Debra) v. Thieret (James)", ""),
         ("Federal Land Bank of Columbia v. Lieben", "Lieben"),
         ("John Willard Greywind v. John T. Podrebarac", ""),
         ("State v. Powell", "Powell"),
         ("Carr v. Galloway", "Carr"),
         ("Saylors v. State", "Saylors"),
         ("Jones v. Franke", "Jones"),
         (
             "In Re Robert L. Mills, Debtor. Robert L. Mills v. Sdrawde "
             "Titleholders, Inc., a California Corporation",
             "",
         ),
         (
             "Pollenex Corporation v. Sunbeam-Home Comfort, a Division of "
             "Sunbeam Corp., Raymond Industrial, Limited and Raymond Marketing "
             "Corporation of North America",
             "",
         ),
         ("Longs v. State", "Longs"),
         ("Performance Network Solutions v. Cyberklix", "Cyberklix"),
         ("DiSabatino v. Salicete", "DiSabatino"),
         ("State v. Jennifer Nicole Jackson", ""),
         ("United States v. Moreno", "Moreno"),
         ("LOGAN & KANAWHA COAL v. Banque Francaise", ""),
         ("State v. Harrison", "Harrison"),
         ("Efford v. Milam", "Efford"),
         ("People v. Thompson", "Thompson"),
         ("CINCINNATI THERMAL SPRAY v. Pender County", ""),
         ("JAH Ex Rel. RMH v. Wadle & Associates", ""),
         ("United Pub. Employees v. CITY & CTY. OF SAN FRAN.", ""),
         ("Warren v. Massachusetts Indemnity", "Warren"),
         (
             'Marion Edwards v. State Farm Insurance Company and "John Doe,"',
             "",
         ),
         ("Snowdon v. Grillo", "Snowdon"),
         ("Adam Lunsford v. Cravens Funeral Home", ""),
         ("State v. Dillon", "Dillon"),
         ("In Re Graham", "In Re Graham"),
         ("Durham v. Chrysler Corp.", ""),  # Fails b/c Durham is a city!
         ("Carolyn Warrick v. Motiva Enterprises, L.L.C", ""),
         ("United States v. Aloi", "Aloi"),
         ("United States Fidelity & Guaranty v. Graham", "Graham"),
         ("Wildberger v. Rosenbaum", "Wildberger"),
         ("Truck Insurance Exchange v. Michling", "Michling"),
         ("Black Voters v. John J. McDonough", ""),
         ("State of Tennessee v. William F. Cain", ""),
         ("Robert J. Imbrogno v. Defense Logistics Agency", ""),
         ("Leetta Beachum, Administratrix v. Timothy Joseph White", ""),
         ("United States v. Jorge Gonzalez-Villegas", ""),
         ("Pitts v. Florida Bd. of Bar Examiners", "Pitts"),
         ("State v. Pastushin", "Pastushin"),
         ("Clark v. Clark", ""),
         ("Barrios v. Holder", "Barrios"),
         ("Gregory L. Lavin v. United States", ""),
         ("Carpenter v. Consumers Power", "Carpenter"),
         ("Derbabian v. S & C SNOWPLOWING, INC.", "Derbabian"),
         ("Bright v. LSI CORP.", "Bright"),
         ("State v. Brown", "Brown"),
         ("KENNEY v. Keebler Co.", "KENNEY"),
         ("Hill v. Chalanor", "Hill"),
         ("Washington v. New Jersey", ""),
         ("Sollek v. Laseter", "Sollek"),
         (
             "United States v. John Handy Jones, International Fidelity "
             "Insurance Company",
             "",
         ),
         ("N.L.R.B. v. I. W. Corp", ""),
         ("Karpisek v. Cather & Sons Construction, Inc.", "Karpisek"),
         ("Com. v. Wade", "Com."),
         ("Glascock v. Sukumlyn", "Glascock"),
         ("Burroughs v. Hills", "Burroughs"),
         ("State v. Darren Matthew Lee", ""),
         ("Mastondrea v. Occidental Hotels Management", "Mastondrea"),
         ("Kent v. C. I. R", "Kent"),
         ("Johnson v. City of Detroit", ""),
         ("Nolan v. United States", "Nolan"),
         ("Currence v. Denver Tramway Corporation", "Currence"),
         ("Matter of Cano", "Matter of Cano"),
         # Two words after "Matter of" --> Punt.
         ("Matter of Alphabet Soup", ""),
         # Zero words after "Matter of" --> Punt.
         ("Matter of", "Matter of"),
         ("Simmons v. Stalder", "Simmons"),
         ("United States v. Donnell Hagood", ""),
         ("Kale v. United States INS", "Kale"),
         ("Cmk v. Department of Revenue Ex Rel. Kb", "Cmk"),
         ("State Farm Mut. Auto. Ins. Co. v. Barnes", "Barnes"),
         ("In Re Krp", "In Re Krp"),
         ("CH v. Department of Children and Families", "CH"),
         ("Com. v. Monosky", "Com."),
         ("JITNEY-JUNGLE, INCORPORATED v. City of Brookhaven", ""),
         ("Carolyn Humphrey v. Memorial Hospitals Association", ""),
         ("Wagner v. Sanders Associates, Inc.", "Wagner"),
         ("United States v. Venie (Arthur G.)", ""),
         ("Mitchell v. State", ""),
         ("City of Biloxi, Miss. v. Giuffrida", "Giuffrida"),
         ("Sexton v. St. Clair Federal Sav. Bank", "Sexton"),
         ("United States v. Matthews", "Matthews"),
         ("Freeman v. Freeman", "Freeman"),
         ("Spencer v. Toussaint", "Spencer"),
         ("In Re Canaday", "In Re Canaday"),
         ("Wenger v. Commission on Judicial Performance", "Wenger"),
         ("Jackson v. Janecka", "Janecka"),
         ("People of Michigan v. Ryan Christopher Smith", ""),
         ("Kincade (Michael) v. State", ""),
         ("Tonubbee v. River Parishes Guide", "Tonubbee"),
         ("United States v. Richiez", "Richiez"),
         ("In Re Allamaras", "In Re Allamaras"),
         ("United States v. Capoccia", "Capoccia"),
         ("Com. v. DeFranco", "Com."),
         ("Matheny v. Porter", "Matheny"),
         ("Piper v. Hoffman", "Piper"),
         ("People v. Smith", ""),  # Punted b/c People and Smith are bad.
         ("Mobuary, Joseph v. State.", ""),  # Punted b/c "State." has punct
     ]
     tweaker = CaseNameTweaker()
     for t in test_pairs:
         output = tweaker.make_case_name_short(t[0])
         self.assertEqual(
             output,
             t[1],
             "Input was:\n\t%s\n\n\tExpected: '%s'\n\tActual: '%s'"
             % (t[0], t[1], output),
         )
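
The method above is excerpted from a larger test case whose class is not shown. Below is a minimal standalone harness that exercises a few pairs from the table, assuming the usual juriscraper import path.

import unittest

from juriscraper.lib.string_utils import CaseNameTweaker  # assumed import path

class CaseNameShortTest(unittest.TestCase):
    def test_a_few_pairs(self):
        tweaker = CaseNameTweaker()
        for full, short in [
            ("Langley v. Google", "Langley"),
            ("United States v. Lissner", "Lissner"),
            ("Clark v. Clark", ""),  # expected '' per the table above
        ]:
            self.assertEqual(tweaker.make_case_name_short(full), short)

if __name__ == "__main__":
    unittest.main()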
Example #10
class Command(BaseCommand):
    help = 'Migrate all data for all apps from one DB to another.'
    case_name_tweaker = CaseNameTweaker()
    the_beginning_of_time = make_aware(datetime(1750, 1, 1), utc)

    def add_arguments(self, parser):
        parser.add_argument(
            '--search',
            action='store_true',
            default=False,
            help="Do migrations for the models in the search app: opinions, "
                 "oral args, and dockets"
        )
        parser.add_argument(
            '--citations',
            action='store_true',
            default=False,
            help="Do migrations for citations between objects"
        )
        parser.add_argument(
            '--user-stuff',
            action='store_true',
            default=False,
            help="Do migrations for user-related stuff (bar memberships, "
                 "alerts, favorites, donations, etc.)"
        )
        parser.add_argument(
            '--stats',
            action='store_true',
            default=False,
            help="Do migrations for stats"
        )

    def handle(self, *args, **options):
        if options['search']:
            self.migrate_opinions_oral_args_and_dockets()
        if options['citations']:
            self.migrate_intra_object_citations()
        if options['user_stuff']:
            self.migrate_users_profiles_alerts_favorites_and_donations()
        if options['stats']:
            self.migrate_stats()

    @staticmethod
    def _none_to_blank(value):
        """Normalizes a field to be u'' instead of None. This is needed b/c the
        old models erroneously had null=True on a number of text fields. If they
        were set up properly according to Django conventions, they'd disallow
        null and have been set to blank instead.
        """
        if value is None:
            return u''
        else:
            return value

    def _get_case_names(self, case_name_orig):
        case_name_len = len(case_name_orig)
        max_case_name_len = 150
        if case_name_len > max_case_name_len:
            case_name = u''
            case_name_full = case_name_orig
        else:
            case_name = case_name_orig
            case_name_full = u''
        case_name_short = self.case_name_tweaker.make_case_name_short(
            case_name_orig)
        return case_name, case_name_full, case_name_short

    def _print_progress(self, progress, total, errors=None):
        """Print the progress of a migration subcomponent.

        If errors is provided it should be a dict of the form:

          errors = {
            'KeyError': 1982,
            'SomeOtherError': 42,
          }

        That is, error keys should be descriptive strings, and their values
        should be counts of how many times it happened.

        Note that using a collections.Counter object for this is very handy.
        """
        if not errors:
            errors = {}
        self.stdout.write("\r\tMigrated %s of %s (%d%%). Skipped %s: (%s)." % (
            progress,
            total,
            float(progress) / total * 100,
            sum(errors.values()),
            ', '.join(['%s: %s' % (k, v) for k, v in errors.items()]),
        ), ending='')
        self.stdout.flush()

    def migrate_opinions_oral_args_and_dockets(self):
        self.stdout.write("Migrating dockets, audio files, and opinions to new "
                          "database...")
        q = DocketOld.objects.using('old').all()
        old_dockets = queryset_generator(q)
        num_dockets = q.count()

        progress = 0
        self._print_progress(progress, num_dockets)
        for old_docket in old_dockets:
            # First do the docket, then create the cluster and opinion objects.
            try:
                old_audio = old_docket.audio_files.all()[0]
            except IndexError:
                old_audio = None
            try:
                old_document = old_docket.documents.all()[0]
            except IndexError:
                old_document = None
            if old_document is not None:
                old_citation = old_document.citation
                old_doc_case_name, old_doc_case_name_full, old_doc_case_name_short = self._get_case_names(old_citation.case_name)
            if old_audio is not None:
                old_audio_case_name, old_audio_case_name_full, old_audio_case_name_short = self._get_case_names(old_audio.case_name)

            court = CourtNew.objects.get(pk=old_docket.court_id)  # Courts are in place thanks to initial data.

            new_docket = DocketNew(
                pk=old_docket.pk,
                date_modified=old_docket.date_modified,
                date_created=old_docket.date_modified,
                court=court,
                case_name=old_doc_case_name,
                case_name_full=old_doc_case_name_full,
                case_name_short=old_doc_case_name_short,
                slug=self._none_to_blank(old_docket.slug),
                docket_number=self._none_to_blank(old_citation.docket_number),
                date_blocked=old_docket.date_blocked,
                blocked=old_docket.blocked,
            )
            if old_audio is not None:
                new_docket.date_argued = old_audio.date_argued
            new_docket.save(using='default')

            if old_document is not None:
                new_opinion_cluster = OpinionClusterNew(
                    pk=old_document.pk,
                    docket=new_docket,
                    judges=self._none_to_blank(old_document.judges),
                    date_modified=old_document.date_modified,
                    date_created=old_document.date_modified,
                    date_filed=old_document.date_filed,
                    slug=self._none_to_blank(old_citation.slug),
                    citation_id=old_document.citation_id,
                    case_name_short=old_doc_case_name_short,
                    case_name=old_doc_case_name,
                    case_name_full=old_doc_case_name_full,
                    federal_cite_one=self._none_to_blank(
                        old_citation.federal_cite_one),
                    federal_cite_two=self._none_to_blank(
                        old_citation.federal_cite_two),
                    federal_cite_three=self._none_to_blank(
                        old_citation.federal_cite_three),
                    state_cite_one=self._none_to_blank(
                        old_citation.state_cite_one),
                    state_cite_two=self._none_to_blank(
                        old_citation.state_cite_two),
                    state_cite_three=self._none_to_blank(
                        old_citation.state_cite_three),
                    state_cite_regional=self._none_to_blank(
                        old_citation.state_cite_regional),
                    specialty_cite_one=self._none_to_blank(
                        old_citation.specialty_cite_one),
                    scotus_early_cite=self._none_to_blank(
                        old_citation.scotus_early_cite),
                    lexis_cite=self._none_to_blank(old_citation.lexis_cite),
                    westlaw_cite=self._none_to_blank(old_citation.westlaw_cite),
                    neutral_cite=self._none_to_blank(old_citation.neutral_cite),
                    scdb_id=self._none_to_blank(
                        old_document.supreme_court_db_id),
                    source=old_document.source,
                    nature_of_suit=old_document.nature_of_suit,
                    citation_count=old_document.citation_count,
                    precedential_status=old_document.precedential_status,
                    date_blocked=old_document.date_blocked,
                    blocked=old_document.blocked,
                )
                new_opinion_cluster.save(
                    using='default',
                    index=False,
                )

                new_opinion = OpinionNew(
                    pk=old_document.pk,
                    cluster=new_opinion_cluster,
                    date_modified=old_document.date_modified,
                    date_created=old_document.time_retrieved,
                    type='010combined',
                    sha1=old_document.sha1,
                    download_url=old_document.download_url,
                    local_path=old_document.local_path,
                    plain_text=old_document.plain_text,
                    html=self._none_to_blank(old_document.html),
                    html_lawbox=self._none_to_blank(old_document.html_lawbox),
                    html_with_citations=old_document.html_with_citations,
                    extracted_by_ocr=old_document.extracted_by_ocr,
                )
                new_opinion.save(
                    using='default',
                    index=False,
                )

            if old_audio is not None:
                new_audio_file = AudioNew(
                    pk=old_audio.pk,
                    docket=new_docket,
                    source=old_audio.source,
                    case_name=old_audio_case_name,
                    case_name_short=old_audio_case_name_short,
                    case_name_full=old_audio_case_name_full,
                    judges=self._none_to_blank(old_audio.judges),
                    date_created=old_audio.time_retrieved,
                    date_modified=old_audio.date_modified,
                    sha1=old_audio.sha1,
                    download_url=old_audio.download_url,
                    local_path_mp3=old_audio.local_path_mp3,
                    local_path_original_file=old_audio.local_path_original_file,
                    duration=old_audio.duration,
                    processing_complete=old_audio.processing_complete,
                    date_blocked=old_audio.date_blocked,
                    blocked=old_audio.blocked,
                )
                new_audio_file.save(
                    using='default',
                    index=False,
                )

            progress += 1
            self._print_progress(progress, num_dockets)
        self.stdout.write(u'')  # Newline

    def migrate_intra_object_citations(self):
        """This method migrates the citations from one database to the other so
        that we don't have to run the citation finding algorithm immediately
        after the migration. Recall that in the legacy schema, Documents have a
        One-2-Many relationship with Citations. This algo handles two kinds of
        citations. The first is the simple case (1 to 1):

                        +--> C2--D2
                       /
            D1--cites--
                       \
                        +--> C3--D3

        This is handled by making a new connection such that D1 cites D2 and D3:

            D1 --cites--> D2

                  and

            D1 --cites--> D3

        The next kind of citation handled is more difficult. In this case,
        multiple Documents share a single Citation (1 to N).

                                 +--D2
                                 |
                        +--> C1--+
                       /         |
            D1--cites--          +--D3
                       \
                        +--> C2--D4

        This is handled by making the original document cite to all the targets:

            D1--cites-->D2
            D1--cites-->D3
            D1--cites-->D4

        """
        self.stdout.write("Migrating citation references to new database...")
        self.stdout.write("\tBuilding lookup dict of Citation IDs to "
                          "Document IDs...")
        # Build lookup dict in memory to avoid DB hits in a moment
        citation_document_pairs = DocumentOld.objects.using(
            'old'
        ).values_list(
            'citation_id',
            'pk'
        )
        # This dict takes the form of:
        #   {
        #      citation_id: [
        #        document_id1,
        #        document_id2,
        #        ...
        #      ],
        #      ...
        #   }
        #
        # The basic idea is that for any citation object's ID, you can look up
        # a list of the documents that have it associated with them.
        cite_to_doc_dict = {}
        for citation_id, document_pk in citation_document_pairs:
            if citation_id in cite_to_doc_dict:
                cite_to_doc_dict[citation_id].append(document_pk)
            else:
                cite_to_doc_dict[citation_id] = [document_pk]

        # Iterate over all existing citations and move them to the correct place
        self.stdout.write(
            "\tBuilding list of all citations from Documents to Citations..."
        )
        DocumentCitationsOld = DocumentOld.cases_cited.through
        all_citations = DocumentCitationsOld.objects.using('old')
        total_count = all_citations.count()
        citation_values = all_citations.values_list(
            'document_id',
            'citation_id'
        )
        progress = 0
        errors = Counter()
        starting_point = 14514268  # For use with failed scripts.
        self._print_progress(progress, total_count, errors)
        new_citations = []
        for document_id, citation_id in citation_values:
            if progress < starting_point:
                errors.update(['AlreadyDone'])
                progress += 1
                continue
            # Early abort if the Citation object has been deleted from the DB.
            try:
                cited_documents = cite_to_doc_dict[citation_id]
            except KeyError:
                errors.update(['KeyError:OrphanCitation'])
                continue
            for cited_document in cited_documents:
                new_citations.append(
                    OpinionsCitedNew(
                        citing_opinion_id=document_id,
                        cited_opinion_id=cited_document,
                    )
                )
                if len(new_citations) % 100 == 0:
                    try:
                        OpinionsCitedNew.objects.using(
                            'default'
                        ).bulk_create(
                            new_citations
                        )
                    except IntegrityError:
                        # Loop through each opinion and save it, marking the
                        # failures. Could do this in the first place, but it's
                        # slower.
                        for new_citation in new_citations:
                            try:
                                new_citation.save()
                            except IntegrityError:
                                errors.update(['IntegrityError:CiteFromOrToMissingOpinionID'])
                                continue
                    new_citations = []

            progress += 1
            self._print_progress(progress, total_count, errors)

        # One final push if there's anything left.
        if len(new_citations) > 0:
            OpinionsCitedNew.objects.using('default').bulk_create(new_citations)
        self.stdout.write(u'')  # Newline

    def migrate_users_profiles_alerts_favorites_and_donations(self):
        self.stdout.write("Migrating users, profiles, alerts, favorites, and "
                          "donations to the new database...")
        old_users = User.objects.using('old').all()
        num_users = old_users.count()

        progress = 0
        self._print_progress(progress, num_users)
        for old_user in old_users:
            old_profile = old_user.profile_legacy
            old_alerts = old_profile.alert.all()
            old_favorites = old_profile.favorite.all()
            old_donations = old_profile.donation.all()

            new_user = User(
                pk=old_user.pk,
                username=old_user.username,
                first_name=old_user.first_name,
                last_name=old_user.last_name,
                email=old_user.email,
                is_staff=old_user.is_staff,
                is_active=old_user.is_active,
                is_superuser=old_user.is_superuser,
                date_joined=old_user.date_joined,
                last_login=old_user.last_login,
                password=old_user.password,
            )
            new_user.save(using='default')

            new_profile = UserProfileNew(
                pk=old_profile.pk,
                user=new_user,
                stub_account=old_profile.stub_account,
                employer=old_profile.employer,
                address1=old_profile.address1,
                address2=old_profile.address2,
                city=old_profile.city,
                state=old_profile.state,
                zip_code=old_profile.zip_code,
                avatar=old_profile.avatar,
                wants_newsletter=old_profile.wants_newsletter,
                plaintext_preferred=old_profile.plaintext_preferred,
                activation_key=old_profile.activation_key,
                key_expires=old_profile.key_expires,
                email_confirmed=old_profile.email_confirmed,
            )
            new_profile.save(using='default')
            new_profile.barmembership.add(
                *[membership.pk for membership in
                  old_profile.barmembership.all()]
            )

            for old_alert in old_alerts:
                new_alert = AlertNew(
                    pk=old_alert.pk,
                    user=new_user,
                    date_created=self.the_beginning_of_time,
                    date_modified=self.the_beginning_of_time,
                    name=old_alert.name,
                    query=old_alert.query,
                    rate=old_alert.rate,
                    always_send_email=old_alert.always_send_email,
                    date_last_hit=old_alert.date_last_hit,
                )
                new_alert.save(using='default')

            for old_favorite in old_favorites:
                opinion_fave_pk = getattr(old_favorite.doc_id, 'pk', None)
                audio_fave_pk = getattr(old_favorite.audio_id, 'pk', None)
                if opinion_fave_pk is not None:
                    cluster = OpinionClusterNew.objects.get(
                        pk=opinion_fave_pk)
                    audio = None
                else:
                    cluster = None
                    audio = AudioNew.objects.get(pk=audio_fave_pk)
                new_favorite = FavoriteNew(
                    pk=old_favorite.pk,
                    user=new_user,
                    cluster_id=cluster,
                    audio_id=audio,
                    date_created=old_favorite.date_modified or now(),
                    date_modified=old_favorite.date_modified or now(),
                    name=old_favorite.name,
                    notes=old_favorite.notes,
                )
                new_favorite.save(using='default')

            for old_donation in old_donations:
                new_donation = DonationNew(
                    pk=old_donation.pk,
                    donor=new_user,
                    date_modified=old_donation.date_modified,
                    date_created=old_donation.date_created,
                    clearing_date=old_donation.clearing_date,
                    send_annual_reminder=old_donation.send_annual_reminder,
                    amount=old_donation.amount,
                    payment_provider=old_donation.payment_provider,
                    payment_id=old_donation.payment_id,
                    transaction_id=old_donation.transaction_id,
                    status=old_donation.status,
                    referrer=old_donation.referrer,
                )
                new_donation.save(using='default')

            progress += 1
            self._print_progress(progress, num_users)
        self.stdout.write(u'')  # Do a newline...

    def migrate_stats(self):
        self.stdout.write("Migrating stats to the new database...")
        # Stats use the same model in new and old, with no db_table definitions.
        # Makes life oh-so-easy.
        old_stats = Stat.objects.using('old').all()
        stat_count = old_stats.count()

        progress = 0
        self._print_progress(progress, stat_count)
        for old_stat in old_stats:
            old_stat.save(using='default')
            progress += 1
            self._print_progress(progress, stat_count)
        self.stdout.write(u'')  # Do a newline...
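
The migration methods above all report their status through a _print_progress helper that is not included in this listing. A minimal sketch of what such a helper might look like, written as a method on the same command class (an assumption for illustration, not the project's actual implementation; it also assumes "import sys" at module level):

    def _print_progress(self, progress, total, errors=None):
        # Hypothetical helper: rewrite a single status line in place.
        line = "\r  Processed %s of %s items" % (progress, total)
        if errors:
            # errors is a collections.Counter of error labels, as used in
            # migrate_intra_object_citations above.
            line += " (errors: %s)" % ", ".join(
                "%s: %s" % (k, v) for k, v in sorted(errors.items()))
        sys.stdout.write(line)
        sys.stdout.flush()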
示例#11
0
    def test_scrape_all_example_files(self):
        """Finds all the $module_example* files and tests them with the sample
        scraper.
        """

        module_strings = build_module_list('juriscraper')
        num_scrapers = len([s for s in module_strings
                            if 'backscraper' not in s])
        print "Testing {count} scrapers against their example files:".format(
            count=num_scrapers)
        max_len_mod_string = max(len(mod) for mod in module_strings
                                 if 'backscraper' not in mod) + 2
        num_example_files = 0
        num_warnings = 0
        cnt = CaseNameTweaker()
        for module_string in module_strings:
            package, module = module_string.rsplit('.', 1)
            mod = __import__("%s.%s" % (package, module),
                             globals(),
                             locals(),
                             [module])
            if 'backscraper' not in module_string:
                sys.stdout.write(
                    '  %s ' % module_string.ljust(max_len_mod_string)
                )
                sys.stdout.flush()
                # module_parts:
                # [0]  - "juriscraper"
                # [1]  - "opinions" or "oral_args"
                # ...  - rest of the path
                # [-1] - module name
                module_parts = module_string.split('.')
                example_path = os.path.join(
                    "juriscraper", "tests", "examples", module_parts[1],
                    "united_states", module_parts[-1],
                )
                paths = glob.glob('%s_example*' % example_path)
                self.assertTrue(
                    paths,
                    "No example file found for: %s! \n\nThe test looked in: "
                    "%s" % (
                        module_string.rsplit('.', 1)[1],
                        os.path.join(os.getcwd(), example_path),
                    ))
                num_example_files += len(paths)
                t1 = time.time()
                num_tests = len(paths)
                for path in paths:
                    # This loop allows multiple example files per module
                    if path.endswith('~'):
                        # Text editor backup: Not interesting.
                        continue
                    site = mod.Site(cnt=cnt)
                    site.url = path
                    # Forces a local GET
                    site.method = 'LOCAL'
                    site.parse()
                t2 = time.time()

                max_speed = 10
                warn_speed = 1
                speed = t2 - t1
                msg = ''
                if speed > max_speed:
                    if sys.gettrace() is None:
                        # Only do this if we're not debugging. Debuggers make
                        # things slower and breakpoints make things stop.
                        raise SlownessException(
                            "This scraper took {speed}s to test, which is more "
                            "than the allowed speed of {max_speed}s. "
                            "Please speed it up for tests to pass.".format(
                                speed=speed,
                                max_speed=max_speed,
                            ))
                elif speed > warn_speed:
                    msg = ' - WARNING: SLOW SCRAPER'
                    num_warnings += 1
                else:
                    msg = ''

                print '(%s test(s) in %0.1f seconds%s)' % (
                    num_tests, speed, msg
                )

        print ("\n{num_scrapers} scrapers tested successfully against "
               "{num_example_files} example files, with {num_warnings} "
               "speed warnings.".format(
            num_scrapers=num_scrapers,
            num_example_files=num_example_files,
            num_warnings=num_warnings,
        ))
        if num_warnings:
            print ("\nAt least one speed warning was triggered during the "
                   "tests. If this is due to a slow scraper you wrote, we "
                   "suggest attempting to speed it up, as it will be slow "
                   "both in production and while running tests. This is "
                   "currently a warning, but may raise a failure in the "
                   "future as performance requirements are tightened.")
        else:
            # Someday, this line of code will be run. That day is not today.
            print "\nNo speed warnings detected. That's great, keep up the " \
                  "good work!"
示例#12
0
class PacerXMLParser(object):
    """A class to parse a PACER XML file"""

    cnt = CaseNameTweaker()

    def __init__(self, path):
        logger.info("Initializing parser for %s" % path)
        # High-level attributes
        self.path = path
        self.xml = self.get_xml_contents()
        self.case_details = self.get_case_details()
        self.document_list = self.get_document_list()
        self.document_count = self.get_document_count()

        # Docket attributes
        self.court = self.get_court()
        self.docket_number = self.get_str_from_node(self.case_details,
                                                    'docket_num')
        self.pacer_case_id = self.get_str_from_node(self.case_details,
                                                    'pacer_case_num')
        self.date_filed = self.get_datetime_from_node(self.case_details,
                                                      'date_case_filed',
                                                      cast_to_date=True)
        self.date_terminated = self.get_datetime_from_node(
            self.case_details, 'date_case_terminated', cast_to_date=True)
        self.date_last_filing = self.get_datetime_from_node(self.case_details,
                                                            'date_last_filing',
                                                            cast_to_date=True)
        self.case_name = harmonize(
            self.get_str_from_node(self.case_details, 'case_name'))
        self.case_name_short = self.cnt.make_case_name_short(self.case_name)
        self.cause = self.get_str_from_node(self.case_details, 'case_cause')
        self.nature_of_suit = self.get_str_from_node(self.case_details,
                                                     'nature_of_suit')
        self.jury_demand = self.get_str_from_node(self.case_details,
                                                  'jury_demand')
        self.jurisdiction_type = self.get_str_from_node(
            self.case_details, 'jurisdiction')
        self.assigned_to, self.assigned_to_str = self.get_judges('assigned_to')
        self.referred_to, self.referred_to_str = self.get_judges('referred_to')
        self.blocked, self.date_blocked = self.set_blocked_fields()

        # Non-parsed fields
        self.filepath_local = os.path.join('recap', self.path)
        self.filepath_ia = get_docketxml_url_from_path(self.path)

    def save(self, debug):
        """Save the item to the database, updating any existing items.

        Returns None if an error occurs.
        """
        required_fields = ['case_name', 'date_filed']
        for field in required_fields:
            if not getattr(self, field):
                print "  Missing required field: %s" % field
                return None

        try:
            d = Docket.objects.get(
                Q(pacer_case_id=self.pacer_case_id)
                | Q(docket_number=self.docket_number),
                court=self.court,
            )
            # Add RECAP as a source if it's not already.
            if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
                d.source = Docket.RECAP_AND_SCRAPER
            elif d.source == Docket.COLUMBIA:
                d.source = Docket.COLUMBIA_AND_RECAP
            elif d.source == Docket.COLUMBIA_AND_SCRAPER:
                d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
        except Docket.DoesNotExist:
            d = Docket(source=Docket.RECAP)
        except Docket.MultipleObjectsReturned:
            print "  Got multiple results while attempting save."
            return None

        for attr, v in self.__dict__.items():
            setattr(d, attr, v)

        if not debug:
            d.save()
            print "  Saved as Docket %s: https://www.courtlistener.com%s" % (
                d.pk, d.get_absolute_url())
        return d

    def get_xml_contents(self):
        """Extract the XML from the file on disk and return it as an lxml
        tree
        """
        xml_parser = etree.XMLParser(recover=True)
        tree = etree.parse(self.path, xml_parser)

        return tree

    def get_case_details(self):
        """Most of the details are in the case_details node, so set it aside
        for faster parsing.
        """
        return self.xml.xpath('//case_details')[0]

    def get_document_list(self):
        """Get the XML nodes for the documents"""
        return self.xml.xpath('//document_list/document')

    def get_document_count(self):
        """Get the number of documents associated with this docket."""
        return len(self.document_list)

    def make_documents(self, docket, debug):
        """Parse through the document nodes, making good objects.

        For every node, create a line item on the Docket (a DocketEntry), and
        create 1..n additional RECAPDocuments (attachments or regular documents)
        that are associated with that DocketEntry.

        Returns None if an error occurs.
        """
        recap_docs = []
        for doc_node in self.document_list:
            # Make a DocketEntry object
            entry_number = doc_node.xpath('@doc_num')[0]
            attachment_number = int(doc_node.xpath('@attachment_num')[0])
            print "Working on document %s, attachment %s" % (entry_number,
                                                             attachment_number)

            if attachment_number == 0:
                document_type = RECAPDocument.PACER_DOCUMENT
            else:
                document_type = RECAPDocument.ATTACHMENT

            try:
                docket_entry = DocketEntry.objects.get(
                    docket=docket,
                    entry_number=entry_number,
                )
            except DocketEntry.DoesNotExist:
                if document_type == RECAPDocument.PACER_DOCUMENT:
                    docket_entry = DocketEntry(
                        docket=docket,
                        entry_number=entry_number,
                    )
                else:
                    logger.error(
                        "Tried to create attachment without a DocketEntry "
                        "object to associate it with.")
                    continue

            if document_type == RECAPDocument.PACER_DOCUMENT:
                date_filed = (self.get_datetime_from_node(
                    doc_node, 'date_filed', cast_to_date=True)
                              or docket_entry.date_filed)
                docket_entry.date_filed = date_filed
                docket_entry.description = (self.get_str_from_node(
                    doc_node, 'long_desc') or docket_entry.description)
                try:
                    if not debug:
                        docket_entry.save()
                except (IntegrityError,
                        DocketEntry.MultipleObjectsReturned) as e:
                    logger.error(
                        "Unable to create docket entry for docket #%s, on "
                        "entry: %s." % (docket, entry_number))
                    continue

            recap_doc = self.make_recap_document(
                doc_node,
                docket_entry,
                entry_number,
                attachment_number,
                document_type,
                debug,
            )
            if recap_doc is not None:
                recap_docs.append(recap_doc)

        return [item.pk for item in recap_docs]

    def make_recap_document(self, doc_node, docket_entry, entry_number,
                            attachment_number, document_type, debug):
        """Make a PACER document."""
        pacer_document_id = self.get_str_from_node(doc_node, 'pacer_doc_id')
        try:
            d = RECAPDocument.objects.get(
                docket_entry=docket_entry,
                document_number=entry_number,
                # Use the attachment number if it is not 0, else use None.
                attachment_number=attachment_number or None,
            )
        except RECAPDocument.DoesNotExist:
            d = RECAPDocument(
                docket_entry=docket_entry,
                pacer_doc_id=pacer_document_id,
            )
        else:
            d.pacer_doc_id = pacer_document_id or d.pacer_doc_id

        d.date_upload = self.get_datetime_from_node(doc_node, 'upload_date')
        d.document_type = document_type or d.document_type
        d.document_number = entry_number

        # If we can't parse the availability node (it returns None), default it
        # to False.
        availability = self.get_bool_from_node(doc_node, 'available')
        d.is_available = False if availability is None else availability
        d.sha1 = self.get_str_from_node(doc_node, 'sha1')
        d.description = (self.get_str_from_node(doc_node, 'short_desc')
                         or d.description)
        if d.is_available:
            d.filepath_ia = get_ia_document_url_from_path(
                self.path, entry_number, attachment_number)
            d.filepath_local = os.path.join(
                'recap',
                get_local_document_url_from_path(self.path, entry_number,
                                                 attachment_number),
            )
            if d.page_count is None:
                extension = d.filepath_local.path.split('.')[-1]
                d.page_count = get_page_count(d.filepath_local.path, extension)
        if document_type == RECAPDocument.ATTACHMENT:
            d.attachment_number = attachment_number
        if not debug:
            try:
                d.save(do_extraction=False, index=False)
            except IntegrityError as e:
                # This happens when a pacer_doc_id has been wrongly set as
                # the document_number, see for example, document 19 and
                # document 00405193374 here: https://ia802300.us.archive.org/23/items/gov.uscourts.ca4.14-1872/gov.uscourts.ca4.14-1872.docket.xml
                logger.error(
                    "Unable to create RECAPDocument for document #%s, "
                    "attachment #%s on entry: %s due to "
                    "IntegrityError." %
                    (d.document_number, d.attachment_number, d.docket_entry))
                return None
        return d

    def get_court(self):
        """Extract the court from the XML and return it as a Court object"""
        court_str = self.case_details.xpath('court/text()')[0].strip()
        try:
            c = Court.objects.get(pk=pacer_to_cl_ids.get(court_str, court_str))
        except Court.DoesNotExist:
            raise ParsingException("Unable to identify court: %s" % court_str)
        else:
            return c

    @staticmethod
    def get_bool_from_node(node, path):
        try:
            s = node.xpath('%s/text()' % path)[0].strip()
            n = int(s)
        except IndexError:
            print "  Couldn't get bool from path: %s" % path
            return None
        except ValueError:
            print(
                "  Couldn't convert text '%s' to int when making boolean "
                "for path: %s" % (s, path))
            return None
        else:
            return bool(n)

    @staticmethod
    def get_str_from_node(node, path):
        try:
            s = node.xpath('%s/text()' % path)[0].strip()
        except IndexError:
            print "  Couldn't get string from path: %s" % path
            return ''  # Return an empty string. Don't return None.
        else:
            return s

    def get_int_from_details(self, node):
        s = self.case_details.xpath('%s/text()' % node)[0].strip()
        try:
            return int(s)
        except ValueError:
            # Can't parse string to int
            print "  Couldn't get int for node %s" % node
            raise ParsingException("Cannot extract int for node %s" % node)

    @staticmethod
    def get_datetime_from_node(node, path, cast_to_date=False):
        """Parse a datetime from the XML located at node."""
        try:
            s = node.xpath('%s/text()' % path)[0].strip()
        except IndexError:
            print "  Couldn't get date from path: %s" % path
            return None
        else:
            try:
                d = parser.parse(s)
            except ValueError:
                print "  Couldn't parse date: %s" % s
                return None
            else:
                # Assume UTC when the parsed date has no timezone info.
                d = d.replace(tzinfo=d.tzinfo or gettz('UTC'))
                if cast_to_date is True:
                    return d.date()
                return d

    def get_judges(self, node):
        """Parse out the judge string and then look it up in the DB"""
        try:
            s = self.case_details.xpath('%s/text()' % node)[0].strip()
        except IndexError:
            print "  Couldn't get judge for node: %s" % node
            return None, ''
        else:
            judge_names = find_judge_names(s)
            judges = []
            for judge_name in judge_names:
                judges.append(
                    find_person(judge_name,
                                self.court.pk,
                                case_date=self.date_filed))
            judges = [c for c in judges if c is not None]
            if len(judges) == 0:
                print "  No judges found after lookup."
                logger.info("No judge for: %s" %
                            ((s, self.court.pk, self.date_filed), ))
                return None, s
            elif len(judges) == 1:
                return judges[0], s
            elif len(judges) > 1:
                print "  Too many judges found: %s" % len(judges)
                return None, s

    def set_blocked_fields(self):
        """Set the blocked status for the Docket.

        Dockets are public (blocked is False) when:

                                   Is Bankr. Court
                                +---------+--------+
                                |   YES   |   NO   |
                +---------------+---------+--------+
         Size   | > 500 items   |    X    |    X   |
          of    +---------------+---------+--------+
        Docket  | <= 500 items  |         |    X   |
                +---------------+---------+--------+

        """
        bankruptcy_privacy_threshold = 500
        small_case = self.document_count <= bankruptcy_privacy_threshold
        if all([small_case, self.court.is_bankruptcy]):
            return True, date.today()
        return False, None
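
A hedged sketch of how this parser might be driven end to end, using only the methods defined above (the path is illustrative, taken from the archive URL mentioned in make_recap_document; it is not a file you are guaranteed to have):

parser = PacerXMLParser('gov.uscourts.ca4.14-1872.docket.xml')
docket = parser.save(debug=False)  # returns None on error, per the docstring above
if docket is not None:
    recap_doc_pks = parser.make_documents(docket, debug=False)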
class Command(VerboseCommand):
    help = 'Runs the Juriscraper toolkit against one or many jurisdictions.'

    def __init__(self, stdout=None, stderr=None, no_color=False):
        super(Command, self).__init__(stdout=None, stderr=None, no_color=False)
        self.cnt = CaseNameTweaker()

    def add_arguments(self, parser):
        parser.add_argument(
            '--daemon',
            action='store_true',
            default=False,
            help=('Use this flag to turn on daemon mode, in which all '
                  'courts requested will be scraped in turn, '
                  'nonstop, in a loop.'),
        )
        parser.add_argument(
            '--rate',
            type=int,
            default=30,
            help=('The length of time in minutes it takes to crawl '
                  'all requested courts. Particularly useful if it is '
                  'desired to quickly scrape over all courts. Default '
                  'is 30 minutes.'),
        )
        parser.add_argument(
            '--courts',
            type=str,
            dest='court_id',
            metavar="COURTID",
            required=True,
            help=('The court(s) to scrape and extract. This should be '
                  'in the form of a python module or package import '
                  'from the Juriscraper library, e.g. '
                  '"juriscraper.opinions.united_states.federal_appellate.ca1" '
                  'or simply "opinions" to do all opinions.'),
        )
        parser.add_argument(
            '--fullcrawl',
            dest='full_crawl',
            action='store_true',
            default=False,
            help="Disable duplicate aborting.",
        )

    def make_objects(self, item, court, sha1_hash, content):
        """Takes the meta data from the scraper and associates it with objects.

        Returns the created objects.
        """
        blocked = item['blocked_statuses']
        if blocked:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))
        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            source=Docket.SCRAPER,
        )

        west_cite_str = item.get('west_citations', '')
        state_cite_str = item.get('west_state_citations', '')
        neutral_cite_str = item.get('neutral_citations', '')
        cluster = OpinionCluster(
            judges=item.get('judges', ''),
            date_filed=item['case_dates'],
            date_filed_is_approximate=item['date_filed_is_approximate'],
            case_name=item['case_names'],
            case_name_short=case_name_short,
            source='C',
            precedential_status=item['precedential_statuses'],
            nature_of_suit=item.get('nature_of_suit', ''),
            blocked=blocked,
            date_blocked=date_blocked,
            # These three fields are replaced below.
            federal_cite_one=west_cite_str,
            state_cite_one=state_cite_str,
            neutral_cite=neutral_cite_str,
            syllabus=item.get('summaries', ''),
        )
        citations = []
        cite_types = [
            (west_cite_str, Citation.WEST),
            (state_cite_str, Citation.STATE),
            (neutral_cite_str, Citation.NEUTRAL),
        ]
        for cite_str, cite_type in cite_types:
            if cite_str:
                citations.append(make_citation(cite_str, cluster, cite_type))
        opinion = Opinion(
            type='010combined',
            sha1=sha1_hash,
            download_url=item['download_urls'],
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            file_name = trunc(item['case_names'].lower(), 75) + extension
            opinion.file_with_date = cluster.date_filed
            opinion.local_path.save(file_name, cf, save=False)
        except:
            msg = ('Unable to save binary to disk. Deleted '
                   'item: %s.\n %s' %
                   (item['case_names'], traceback.format_exc()))
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, opinion, cluster, citations, error

    def save_everything(self, items, index=False, backscrape=False):
        """Saves all the sub items and associates them as appropriate.
        """
        docket, cluster = items['docket'], items['cluster']
        opinion, citations = items['opinion'], items['citations']
        docket.save()
        cluster.docket = docket
        cluster.save(index=False)  # Index only when the opinion is associated.

        for citation in citations:
            citation.cluster_id = cluster.pk
            citation.save()

        if cluster.judges:
            candidate_judges = get_candidate_judges(
                cluster.judges,
                docket.court.pk,
                cluster.date_filed,
            )
            if len(candidate_judges) == 1:
                opinion.author = candidate_judges[0]

            if len(candidate_judges) > 1:
                for candidate in candidate_judges:
                    cluster.panel.add(candidate)

        opinion.cluster = cluster
        opinion.save(index=index)
        if not backscrape:
            RealTimeQueue.objects.create(item_type='o', item_pk=opinion.pk)

    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i, item in enumerate(site):
                msg, r = get_binary_content(
                    item['download_urls'],
                    site.cookies,
                    site._get_adapter_instance(),
                    method=site.method
                )
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level='WARNING',
                             court=court,
                             message=msg).save()
                    continue

                content = site.cleanup_content(r.content)

                current_date = item['case_dates']
                try:
                    next_date = site[i + 1]['case_dates']
                except IndexError:
                    next_date = None

                # request.content is sometimes a str, sometimes unicode, so
                # force it all to be bytes, pleasing hashlib.
                sha1_hash = hashlib.sha1(force_bytes(content)).hexdigest()
                if (court_str == 'nev' and
                        item['precedential_statuses'] == 'Unpublished'):
                    # Nevada's non-precedential cases have different SHA1
                    # sums every time.
                    lookup_params = {'lookup_value': item['download_urls'],
                                     'lookup_by': 'download_url'}
                else:
                    lookup_params = {'lookup_value': sha1_hash,
                                     'lookup_by': 'sha1'}

                onwards = dup_checker.press_on(Opinion, current_date, next_date,
                                               **lookup_params)
                if dup_checker.emulate_break:
                    break

                if onwards:
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' %
                                item['download_urls'].encode('utf-8'))
                    dup_checker.reset()

                    docket, opinion, cluster, citations, error = self.make_objects(
                        item, court, sha1_hash, content
                    )

                    if error:
                        download_error = True
                        continue

                    self.save_everything(
                        items={
                            'docket': docket,
                            'opinion': opinion,
                            'cluster': cluster,
                            'citations': citations,
                        },
                        index=False
                    )
                    extract_doc_content.delay(
                        opinion.pk, do_ocr=True,
                        citation_jitter=True,
                    )

                    logger.info("Successfully added doc {pk}: {name}".format(
                        pk=opinion.pk,
                        name=item['case_names'].encode('utf-8'),
                    ))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled opinions." % site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)

    def parse_and_scrape_site(self, mod, full_crawl):
        site = mod.Site().parse()
        self.scrape_court(site, full_crawl)

    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)
        global die_now

        # this line is used for handling SIGTERM (CTRL+4), so things can die
        # safely
        signal.signal(signal.SIGTERM, signal_handler)

        module_strings = build_module_list(options['court_id'])
        if not len(module_strings):
            raise CommandError('Unable to import module or package. Aborting.')

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        wait = (options['rate'] * 60) / num_courts
        i = 0
        while i < num_courts:
            # this catches SIGTERM, so the code can be killed safely.
            if die_now:
                logger.info("The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)

            mod = __import__(
                "%s.%s" % (package, module),
                globals(),
                locals(),
                [module]
            )
            # noinspection PyBroadException
            try:
                self.parse_and_scrape_site(mod, options['full_crawl'])
            except Exception as e:
                # noinspection PyBroadException
                try:
                    msg = ('********!! CRAWLER DOWN !!***********\n'
                           '*****scrape_court method failed!*****\n'
                           '********!! ACTION NEEDED !!**********\n%s' %
                           traceback.format_exc())
                    logger.critical(msg)

                    # opinions.united_states.federal.ca9_u --> ca9
                    court_str = mod.Site.__module__.split('.')[-1].split('_')[0]
                    court = Court.objects.get(pk=court_str)
                    ErrorLog(
                        log_level='CRITICAL',
                        court=court,
                        message=msg
                    ).save()
                except Exception as e:
                    # This is very important. Without this, an exception
                    # above will crash the caller.
                    pass
            finally:
                time.sleep(wait)
                last_court_in_list = (i == (num_courts - 1))
                if last_court_in_list and options['daemon']:
                    # Start over...
                    logger.info("All jurisdictions done. Looping back to "
                                "the beginning because daemon mode is enabled.")
                    i = 0
                else:
                    i += 1

        logger.info("The scraper has stopped.")
        sys.exit(0)
    def test_scrape_all_example_files(self):
        """Finds all the $module_example* files and tests them with the sample
        scraper.
        """

        module_strings = build_module_list('juriscraper')
        num_scrapers = len([s for s in module_strings
                            if 'backscraper' not in s])
        msg = "Testing {count} scrapers against their example files:"
        print(msg.format(count=num_scrapers))
        max_len_mod_string = max(len(mod) for mod in module_strings
                                 if 'backscraper' not in mod) + 2
        num_example_files = 0
        num_warnings = 0
        cnt = CaseNameTweaker()
        json_compare_extension = '.compare.json'
        for module_string in module_strings:
            package, module = module_string.rsplit('.', 1)
            mod = __import__("%s.%s" % (package, module),
                             globals(),
                             locals(),
                             [module])
            if 'backscraper' not in module_string:
                sys.stdout.write(
                    '  %s ' % module_string.ljust(max_len_mod_string)
                )
                sys.stdout.flush()
                # module_parts:
                # [0]  - "juriscraper"
                # [1]  - "opinions" or "oral_args"
                # ...  - rest of the path
                # [-1] - module name
                module_parts = module_string.split('.')
                example_path = os.path.join(
                    "tests", "examples", module_parts[1],
                    "united_states", module_parts[-1],
                )
                paths = [path for path in glob.glob('%s_example*' % example_path)
                         if not path.endswith(json_compare_extension)]
                self.assertTrue(
                    paths,
                    "No example file found for: %s! \n\nThe test looked in: "
                    "%s" % (
                        module_string.rsplit('.', 1)[1],
                        os.path.join(os.getcwd(), example_path),
                    ))
                num_example_files += len(paths)
                t1 = time.time()
                num_tests = len(paths)
                for path in paths:
                    # This loop allows multiple example files per module
                    if path.endswith('~'):
                        # Text editor backup: Not interesting.
                        continue
                    site = mod.Site(cnt=cnt)
                    site.url = path
                    # Forces a local GET
                    site.method = 'LOCAL'
                    site.parse()
                    # Now validate that the parsed result is as we expect
                    json_path = '%s%s' % (path.rsplit('.', 1)[0], json_compare_extension)
                    json_data = json.loads(site.to_json(), encoding='utf-8')
                    if os.path.isfile(json_path):
                        # Compare result with corresponding json file
                        example_file = path.rsplit('/', 1)[1]
                        compare_file = json_path.rsplit('/', 1)[1]
                        with open(json_path, 'r') as input_file:
                            fixture_json = json.load(input_file)
                            self.assertEqual(
                                len(fixture_json),
                                len(json_data),
                                msg="Fixture and scraped data have different "
                                    "lengths: expected %s and scraped %s (%s)" % (
                                    len(fixture_json),
                                    len(json_data),
                                    module_string
                                )
                            )
                            for i, item in enumerate(fixture_json):
                                self.assertEqual(
                                    fixture_json[i],
                                    json_data[i],
                                )

                    else:
                        # Generate corresponding json file if it doesn't
                        # already exist. This should only happen once
                        # when adding a new example html file.
                        with open(json_path, 'w') as json_example:
                            json.dump(json_data, json_example, indent=2)
                t2 = time.time()

                max_speed = 15
                warn_speed = 1
                speed = t2 - t1
                msg = ''
                if speed > max_speed:
                    if sys.gettrace() is None and not IS_TRAVIS:
                        # Only do this if we're not debugging. Debuggers make
                        # things slower and breakpoints make things stop.
                        raise SlownessException(
                            "This scraper took {speed}s to test, which is more "
                            "than the allowed speed of {max_speed}s. "
                            "Please speed it up for tests to pass.".format(
                                speed=speed,
                                max_speed=max_speed,
                            ))
                elif speed > warn_speed:
                    msg = ' - WARNING: SLOW SCRAPER'
                    num_warnings += 1
                else:
                    msg = ''

                print('(%s test(s) in %0.1f seconds%s)' % (num_tests, speed, msg))

        print("\n{num_scrapers} scrapers tested successfully against "
              "{num_example_files} example files, with {num_warnings} "
              "speed warnings.".format(
                  num_scrapers=num_scrapers,
                  num_example_files=num_example_files,
                  num_warnings=num_warnings,))
        if num_warnings:
            print("\nAt least one speed warning was triggered during the "
                   "tests. If this is due to a slow scraper you wrote, we "
                   "suggest attempting to speed it up, as it will be slow "
                   "both in production and while running tests. This is "
                   "currently a warning, but may raise a failure in the "
                   "future as performance requirements are tightened.")
        else:
            # Someday, this line of code will be run. That day is not today.
            print("\nNo speed warnings detected. That's great, keep up the " \
                  "good work!")
示例#16
0
# Functions to parse court data in XML format into a list of dictionaries.
import os
import re
import xml.etree.cElementTree as ET

import dateutil.parser as dparser
from juriscraper.lib.string_utils import titlecase, harmonize, clean_string, CaseNameTweaker
from lxml import etree

from cl.corpus_importer.court_regexes import state_pairs
from cl.lib.crypto import sha1_of_file
from parse_judges import find_judge_names
from regexes_columbia import SPECIAL_REGEXES, FOLDER_DICT

# initialized once since it takes resources
CASE_NAME_TWEAKER = CaseNameTweaker()

# tags for which content will be condensed into plain text
SIMPLE_TAGS = [
    "reporter_caption", "citation", "caption", "court", "docket", "posture",
    "date", "hearing_date", "panel", "attorneys"
]

# regex that will be applied when condensing SIMPLE_TAGS content
STRIP_REGEX = [r'</?citation.*>', r'</?page_number.*>']

# types of opinions that will be parsed
# each may have a '_byline' and '_text' node
OPINION_TYPES = ['opinion', 'dissent', 'concurrence']
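
A minimal sketch of how the SIMPLE_TAGS content could be condensed using STRIP_REGEX and the imported clean_string (this helper is assumed for illustration; it is not defined in this module):

def condense_simple_tag(raw_markup):
    # Assumed helper (not part of this module): drop citation and page-number
    # markup, then normalize whitespace.
    for pattern in STRIP_REGEX:
        raw_markup = re.sub(pattern, '', raw_markup)
    return clean_string(raw_markup)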

    def test_scrape_all_example_files(self):
        """Finds all the $module_example* files and tests them with the sample
        scraper.
        """

        module_strings = build_module_list('juriscraper')
        num_scrapers = len([s for s in module_strings
                            if 'backscraper' not in s])
        max_len_mod_string = max(len(mod) for mod in module_strings
                                 if 'backscraper' not in mod) + 2
        num_example_files = 0
        num_warnings = 0
        cnt = CaseNameTweaker()
        json_compare_extension = '.compare.json'
        json_compare_files_generated = []
        for module_string in module_strings:
            package, module = module_string.rsplit('.', 1)
            mod = __import__("%s.%s" % (package, module),
                             globals(),
                             locals(),
                             [module])
            if 'backscraper' not in module_string:
                sys.stdout.write(
                    '  %s ' % module_string.ljust(max_len_mod_string)
                )
                sys.stdout.flush()
                # module_parts:
                # [0]  - "juriscraper"
                # [1]  - "opinions" or "oral_args"
                # ...  - rest of the path
                # [-1] - module name
                module_parts = module_string.split('.')
                example_path = os.path.join(
                    "tests", "examples", module_parts[1],
                    "united_states", module_parts[-1],
                )
                paths = [path for path in glob.glob('%s_example*' % example_path)
                         if not path.endswith(json_compare_extension)]
                self.assertTrue(
                    paths,
                    "No example file found for: %s! \n\nThe test looked in: "
                    "%s" % (
                        module_string.rsplit('.', 1)[1],
                        os.path.join(os.getcwd(), example_path),
                    ))
                num_example_files += len(paths)
                t1 = time.time()
                num_tests = len(paths)
                for path in paths:
                    # This loop allows multiple example files per module
                    if path.endswith('~'):
                        # Text editor backup: Not interesting.
                        continue
                    site = mod.Site(cnt=cnt)
                    site.url = path
                    # Forces a local GET
                    site.enable_test_mode()
                    site.parse()
                    # Now validate that the parsed result is as we expect
                    json_path = '%s%s' % (path.rsplit('.', 1)[0], json_compare_extension)
                    json_data = json.loads(site.to_json(), encoding='utf-8')
                    if os.path.isfile(json_path):
                        # Compare result with corresponding json file
                        example_file = path.rsplit('/', 1)[1]
                        compare_file = json_path.rsplit('/', 1)[1]
                        with open(json_path, 'r') as input_file:
                            fixture_json = json.load(input_file)
                            self.assertEqual(
                                len(fixture_json),
                                len(json_data),
                                msg="Fixture and scraped data have different "
                                    "lengths: expected %s and scraped %s (%s)" % (
                                    len(fixture_json),
                                    len(json_data),
                                    module_string
                                )
                            )
                            for i, item in enumerate(fixture_json):
                                self.assertEqual(
                                    fixture_json[i],
                                    json_data[i],
                                )

                    else:
                        # Generate corresponding json file if it doesn't
                        # already exist. This should only happen once
                        # when adding a new example html file.
                        warn_generated_compare_file(json_path)
                        json_compare_files_generated.append(json_path)
                        with open(json_path, 'w') as json_example:
                            json.dump(json_data, json_example, indent=2)
                t2 = time.time()
                duration = t2 - t1
                warning_msg = warn_or_crash_slow_parser(t2 - t1)
                if warning_msg:
                    num_warnings += 1

                print('(%s test(s) in %0.1f seconds)' %
                      (num_tests, duration))

        print("\n{num_scrapers} scrapers tested successfully against "
              "{num_example_files} example files, with {num_warnings} "
              "speed warnings.".format(
                  num_scrapers=num_scrapers,
                  num_example_files=num_example_files,
                  num_warnings=num_warnings,))
        if json_compare_files_generated:
            msg = 'Generated compare file(s) during test, please review before proceeding. ' \
                  'If the data looks good, run tests again, then be sure to include ' \
                  'the new compare file(s) in your commit: %s'
            self.fail(msg % ', '.join(json_compare_files_generated))
        if num_warnings:
            print("\nAt least one speed warning was triggered during the "
                   "tests. If this is due to a slow scraper you wrote, we "
                   "suggest attempting to speed it up, as it will be slow "
                   "both in production and while running tests. This is "
                   "currently a warning, but may raise a failure in the "
                   "future as performance requirements are tightened.")
        else:
            # Someday, this line of code will be run. That day is not today.
            print("\nNo speed warnings detected. That's great, keep up the " \
                  "good work!")
Example #18
0
)
from cl.recap.models import (
    FjcIntegratedDatabase,
    PacerFetchQueue,
    PacerHtmlFiles,
    ProcessingQueue,
    PROCESSING_STATUS,
    REQUEST_TYPE,
    UPLOAD_TYPE,
)
from cl.scrapers.tasks import extract_recap_pdf, get_page_count
from cl.search.models import Docket, DocketEntry, RECAPDocument
from cl.search.tasks import add_or_update_recap_docket, add_items_to_solr

logger = logging.getLogger(__name__)
cnt = CaseNameTweaker()


def process_recap_upload(pq):
    """Process an item uploaded from an extension or API user.

    Uploaded objects can take a variety of forms, and we'll need to
    process them accordingly.
    """
    if pq.upload_type == UPLOAD_TYPE.DOCKET:
        chain(
            process_recap_docket.s(pq.pk), add_or_update_recap_docket.s()
        ).apply_async()
    elif pq.upload_type == UPLOAD_TYPE.ATTACHMENT_PAGE:
        process_recap_attachment.delay(pq.pk)
    elif pq.upload_type == UPLOAD_TYPE.PDF:
        # Assumed completion of the truncated snippet: dispatch the
        # PDF-processing task for this row.
        process_recap_pdf.delay(pq.pk)
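

# --- Hedged usage sketch (not part of the original module) ---
# A minimal illustration of driving the dispatcher above for rows that are
# still waiting to be processed. It reuses the models imported earlier in this
# snippet; the helper name and the PROCESSING_STATUS.ENQUEUED member are
# assumptions for illustration, not code from the project.
def reprocess_pending_docket_uploads():
    pending = ProcessingQueue.objects.filter(
        status=PROCESSING_STATUS.ENQUEUED,
        upload_type=UPLOAD_TYPE.DOCKET,
    )
    for pq in pending:
        process_recap_upload(pq)
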
    def test_make_short_name(self):
        test_pairs = [
            # In re and Matter of
            ('In re Lissner', 'In re Lissner'),
            ('Matter of Lissner', 'Matter of Lissner'),

            # Plaintiff is in bad word list
            ('State v. Lissner', 'Lissner'),
            ('People v. Lissner', 'Lissner'),
            ('California v. Lissner', 'Lissner'),
            ('Dallas v. Lissner', 'Lissner'),

            # Basic 3-word case
            ('Langley v. Google', 'Langley'),
            # Similar to above, but more than 3 words
            ('Langley v. Google foo', 'Langley'),

            # United States v. ...
            ('United States v. Lissner', 'Lissner'),

            # Corporate first name
            ('Google, Inc. v. Langley', 'Langley'),
            ('Special, LLC v. Langley', 'Langley'),
            ('Google Corp. v. Langley', 'Langley'),

            # Shorter appellant than plaintiff
            ('Michael Lissner v. Langley', 'Langley'),

            # Multi-v with and w/o a bad_word
            ('Alameda v. Victor v. Keyboard', ''),
            ('Bloggers v. Victor v. Keyboard', ''),

            # Long left, short right
            ('Many words here v. Langley', 'Langley'),

            # Other manually added items
            ('Ilarion v. State', 'Ilarion'),
            ('Imery v. Vangil Ingenieros', 'Imery'),

            # Many more tests from real data!
            ('Bean v. City of Monahans', 'Bean'),
            ('Blanke v. Time, Inc.', 'Blanke'),
            ('New York Life Ins. Co. v. Deshotel', 'Deshotel'),
            ('Deatherage v. Deatherage', 'Deatherage'),
            ('Gonzalez Vargas v. Holder', ''),
            ('Campbell v. Wainwright', 'Campbell'),
            ('Liggett & Myers Tobacco Co. v. Finzer', 'Finzer'),
            ('United States v. Brenes', 'Brenes'),
            ('A.H. Robins Co., Inc. v. Eli Lilly & Co', ''),
            ('McKellar v. Hazen', 'McKellar'),
            ('Gil v. State', 'Gil'),
            ('Fuentes v. Owen', 'Fuentes'),
            ('State v. Shearer', 'Shearer'),
            ('United States v. Smither', 'Smither'),
            ('People v. Bradbury', 'Bradbury'),
            ('Venable (James) v. State', ''),
            ('Burkhardt v. Bailey', 'Burkhardt'),
            ('DeLorenzo v. Bales', 'DeLorenzo'),
            ('Loucks v. Bauman', 'Loucks'),
            ('Kenneth Stern v. Robert Weinstein', ''),
            ('Rayner v. Secretary of Health and Human Services', 'Rayner'),
            ('Rhyne v. Martin', 'Rhyne'),
            ('State v. Wolverton', 'Wolverton'),
            ('State v. Flood', 'Flood'),
            ('Amason v. Natural Gas Pipeline Co.', 'Amason'),
            ('United States v. Bryant', 'Bryant'),
            ('WELLS FARGO BANK v. APACHE TRIBE OF OKLAHOMA', ''),
            ('Stewart v. Tupperware Corp.', 'Stewart'),
            ('Society of New York Hosp. v. ASSOCIATED HOSP. SERV. OF NY', ''),
            ('Stein v. State Tax Commission', 'Stein'),
            (
                'The Putnam Pit, Inc. Geoffrey Davidian v. City of Cookeville, Tennessee Jim Shipley',
                ''),
            ('People v. Armstrong', 'Armstrong'),
            ('Weeks v. Weeks', 'Weeks'),
            ('Smith v. Xerox Corp.', ''),
            ('In Interest of Ad', ''),
            ('People v. Forsyth', 'Forsyth'),
            ('State v. LeClair', 'LeClair'),
            ('Agristor Credit Corp. v. Unruh', 'Unruh'),
            ('United States v. Larry L. Stewart', ''),
            ('Starling v. United States', 'Starling'),
            ('United States v. Pablo Colin-Molina', ''),
            ('Kenneth N. Juhl v. The United States', ''),
            ('Matter of Wilson', 'Matter of Wilson'),
            ('In Re Damon H.', ''),
            ('Centennial Ins. Co. v. Zylberberg', 'Zylberberg'),
            ('United States v. Donald Lee Stotler', ''),
            ('Byndloss v. State', 'Byndloss'),
            ('People v. Piatkowski', 'Piatkowski'),
            ('United States v. Willie James Morgan', ''),
            ('Harbison (Debra) v. Thieret (James)', ''),
            ('Federal Land Bank of Columbia v. Lieben', 'Lieben'),
            ('John Willard Greywind v. John T. Podrebarac', ''),
            ('State v. Powell', 'Powell'),
            ('Carr v. Galloway', 'Carr'),
            ('Saylors v. State', 'Saylors'),
            ('Jones v. Franke', 'Jones'),
            ('In Re Robert L. Mills, Debtor. Robert L. Mills v. Sdrawde '
             'Titleholders, Inc., a California Corporation', ''),
            ('Pollenex Corporation v. Sunbeam-Home Comfort, a Division of '
             'Sunbeam Corp., Raymond Industrial, Limited and Raymond Marketing '
             'Corporation of North America', ''),
            ('Longs v. State', 'Longs'),
            ('Performance Network Solutions v. Cyberklix', 'Cyberklix'),
            ('DiSabatino v. Salicete', 'DiSabatino'),
            ('State v. Jennifer Nicole Jackson', ''),
            ('United States v. Moreno', 'Moreno'),
            ('LOGAN & KANAWHA COAL v. Banque Francaise', ''),
            ('State v. Harrison', 'Harrison'),
            ('Efford v. Milam', 'Efford'),
            ('People v. Thompson', 'Thompson'),
            ('CINCINNATI THERMAL SPRAY v. Pender County', ''),
            ('JAH Ex Rel. RMH v. Wadle & Associates', ''),
            ('United Pub. Employees v. CITY & CTY. OF SAN FRAN.', ''),
            ('Warren v. Massachusetts Indemnity', 'Warren'),
            ('Marion Edwards v. State Farm Insurance Company and "John Doe,"',
             ''),
            ('Snowdon v. Grillo', 'Snowdon'),
            ('Adam Lunsford v. Cravens Funeral Home', ''),
            ('State v. Dillon', 'Dillon'),
            ('In Re Graham', 'In Re Graham'),
            ('Durham v. Chrysler Corp.', ''),  # Fails b/c Durham is a city!
            ('Carolyn Warrick v. Motiva Enterprises, L.L.C', ''),
            ('United States v. Aloi', 'Aloi'),
            ('United States Fidelity & Guaranty v. Graham', 'Graham'),
            ('Wildberger v. Rosenbaum', 'Wildberger'),
            ('Truck Insurance Exchange v. Michling', 'Michling'),
            ('Black Voters v. John J. McDonough', ''),
            ('State of Tennessee v. William F. Cain', ''),
            ('Robert J. Imbrogno v. Defense Logistics Agency', ''),
            ('Leetta Beachum, Administratrix v. Timothy Joseph White', ''),
            ('United States v. Jorge Gonzalez-Villegas', ''),
            ('Pitts v. Florida Bd. of Bar Examiners', 'Pitts'),
            ('State v. Pastushin', 'Pastushin'),
            ('Clark v. Clark', ''),
            ('Barrios v. Holder', 'Barrios'),
            ('Gregory L. Lavin v. United States', ''),
            ('Carpenter v. Consumers Power', 'Carpenter'),
            ('Derbabian v. S & C SNOWPLOWING, INC.', 'Derbabian'),
            ('Bright v. LSI CORP.', 'Bright'),
            ('State v. Brown', 'Brown'),
            ('KENNEY v. Keebler Co.', 'KENNEY'),
            ('Hill v. Chalanor', 'Hill'),
            ('Washington v. New Jersey', ''),
            ('Sollek v. Laseter', 'Sollek'),
            ('United States v. John Handy Jones, International Fidelity '
             'Insurance Company', ''),
            ('N.L.R.B. v. I. W. Corp', ''),
            ('Karpisek v. Cather & Sons Construction, Inc.', 'Karpisek'),
            ('Com. v. Wade', 'Com.'),
            ('Glascock v. Sukumlyn', 'Glascock'),
            ('Burroughs v. Hills', 'Burroughs'),
            ('State v. Darren Matthew Lee', ''),
            ('Mastondrea v. Occidental Hotels Management', 'Mastondrea'),
            ('Kent v. C. I. R', 'Kent'),
            ('Johnson v. City of Detroit', ''),
            ('Nolan v. United States', 'Nolan'),
            ('Currence v. Denver Tramway Corporation', 'Currence'),
            ('Matter of Cano', 'Matter of Cano'),
            # Two words after "Matter of" --> Punt.
            ('Matter of Alphabet Soup', ''),
            # Zero words after "Matter of" --> Punt.
            ("Matter of", "Matter of"),
            ('Simmons v. Stalder', 'Simmons'),
            ('United States v. Donnell Hagood', ''),
            ('Kale v. United States INS', 'Kale'),
            ('Cmk v. Department of Revenue Ex Rel. Kb', 'Cmk'),
            ('State Farm Mut. Auto. Ins. Co. v. Barnes', 'Barnes'),
            ('In Re Krp', 'In Re Krp'),
            ('CH v. Department of Children and Families', 'CH'),
            ('Com. v. Monosky', 'Com.'),
            ('JITNEY-JUNGLE, INCORPORATED v. City of Brookhaven', ''),
            ('Carolyn Humphrey v. Memorial Hospitals Association', ''),
            ('Wagner v. Sanders Associates, Inc.', 'Wagner'),
            ('United States v. Venie (Arthur G.)', ''),
            ('Mitchell v. State', ''),
            ('City of Biloxi, Miss. v. Giuffrida', 'Giuffrida'),
            ('Sexton v. St. Clair Federal Sav. Bank', 'Sexton'),
            ('United States v. Matthews', 'Matthews'),
            ('Freeman v. Freeman', 'Freeman'),
            ('Spencer v. Toussaint', 'Spencer'),
            ('In Re Canaday', 'In Re Canaday'),
            ('Wenger v. Commission on Judicial Performance', 'Wenger'),
            ('Jackson v. Janecka', 'Janecka'),
            ('People of Michigan v. Ryan Christopher Smith', ''),
            ('Kincade (Michael) v. State', ''),
            ('Tonubbee v. River Parishes Guide', 'Tonubbee'),
            ('United States v. Richiez', 'Richiez'),
            ('In Re Allamaras', 'In Re Allamaras'),
            ('United States v. Capoccia', 'Capoccia'),
            ('Com. v. DeFranco', 'Com.'),
            ('Matheny v. Porter', 'Matheny'),
            ('Piper v. Hoffman', 'Piper'),
            ('People v. Smith', ''),  # Punted b/c People and Smith are bad.
            ('Mobuary, Joseph v. State.', ''),  # Punted b/c "State." has punct
        ]
        tweaker = CaseNameTweaker()
        for t in test_pairs:
            output = tweaker.make_case_name_short(t[0])
            self.assertEqual(output, t[1],
                             "Input was:\n\t%s\n\n\tExpected: '%s'\n\tActual: '%s'" %
                             (t[0], t[1], output))
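
# --- Hedged usage sketch (not part of the original test) ---
# What the table above exercises, in three lines: CaseNameTweaker shortens a
# case name to the party worth displaying, or returns an empty string when it
# punts. The expected values are taken directly from the test pairs; the
# import path is assumed.
from juriscraper.lib.string_utils import CaseNameTweaker

tweaker = CaseNameTweaker()
assert tweaker.make_case_name_short('United States v. Lissner') == 'Lissner'
assert tweaker.make_case_name_short('Langley v. Google') == 'Langley'
assert tweaker.make_case_name_short('Alameda v. Victor v. Keyboard') == ''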