class Command(VerboseCommand):
    """Run the Juriscraper toolkit against one or many jurisdictions,
    downloading new opinions, de-duplicating them, and saving them to the
    database.
    """
    help = 'Runs the Juriscraper toolkit against one or many jurisdictions.'

    def __init__(self, stdout=None, stderr=None, no_color=False):
        # BUG FIX: the original ignored its arguments and always forwarded
        # literal defaults to super(); pass the caller's values through so
        # output streams and color settings are honored.
        super(Command, self).__init__(stdout=stdout, stderr=stderr,
                                      no_color=no_color)
        self.cnt = CaseNameTweaker()

    def add_arguments(self, parser):
        """Register the command-line options for this command."""
        parser.add_argument(
            '--daemon',
            action='store_true',
            default=False,
            help=('Use this flag to turn on daemon mode, in which all '
                  'courts requested will be scraped in turn, '
                  'nonstop, in a loop.'),
        )
        parser.add_argument(
            '--rate',
            type=int,
            default=30,
            help=('The length of time in minutes it takes to crawl '
                  'all requested courts. Particularly useful if it is '
                  'desired to quickly scrape over all courts. Default '
                  'is 30 minutes.'),
        )
        parser.add_argument(
            '--courts',
            type=str,
            dest='court_id',
            metavar="COURTID",
            required=True,
            help=('The court(s) to scrape and extract. This should be '
                  'in the form of a python module or package import '
                  'from the Juriscraper library, e.g. '
                  '"juriscraper.opinions.united_states.federal_appellate.ca1" '
                  'or simply "opinions" to do all opinions.'),
        )
        parser.add_argument(
            '--fullcrawl',
            dest='full_crawl',
            action='store_true',
            default=False,
            help="Disable duplicate aborting.",
        )

    def make_objects(self, item, court, sha1_hash, content):
        """Takes the meta data from the scraper and associates it with objects.

        Returns the created objects as a tuple of
        (docket, opinion, cluster, citations, error), where ``error`` is
        True when the binary content could not be saved to disk. None of
        the returned objects has been persisted yet except the ErrorLog
        written on failure.
        """
        blocked = item['blocked_statuses']
        if blocked:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))
        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            source=Docket.SCRAPER,
        )

        west_cite_str = item.get('west_citations', '')
        state_cite_str = item.get('west_state_citations', '')
        neutral_cite_str = item.get('neutral_citations', '')
        cluster = OpinionCluster(
            judges=item.get('judges', ''),
            date_filed=item['case_dates'],
            date_filed_is_approximate=item['date_filed_is_approximate'],
            case_name=item['case_names'],
            case_name_short=case_name_short,
            source='C',
            precedential_status=item['precedential_statuses'],
            nature_of_suit=item.get('nature_of_suit', ''),
            blocked=blocked,
            date_blocked=date_blocked,
            # These three fields are replaced below.
            federal_cite_one=west_cite_str,
            state_cite_one=state_cite_str,
            neutral_cite=neutral_cite_str,
            syllabus=item.get('summaries', ''),
        )
        citations = []
        cite_types = [
            (west_cite_str, Citation.WEST),
            (state_cite_str, Citation.STATE),
            (neutral_cite_str, Citation.NEUTRAL),
        ]
        for cite_str, cite_type in cite_types:
            if cite_str:
                citations.append(make_citation(cite_str, cluster, cite_type))
        opinion = Opinion(
            type='010combined',
            sha1=sha1_hash,
            download_url=item['download_urls'],
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            file_name = trunc(item['case_names'].lower(), 75) + extension
            opinion.file_with_date = cluster.date_filed
            opinion.local_path.save(file_name, cf, save=False)
        except Exception:
            # BUG FIX: was a bare ``except:`` which also swallowed
            # SystemExit/KeyboardInterrupt. Still deliberately broad: any
            # failure here is logged and reported via the error flag.
            msg = ('Unable to save binary to disk. Deleted '
                   'item: %s.\n %s' %
                   (item['case_names'], traceback.format_exc()))
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, opinion, cluster, citations, error

    def save_everything(self, items, index=False, backscrape=False):
        """Saves all the sub items and associates them as appropriate.
        """
        docket, cluster = items['docket'], items['cluster']
        opinion, citations = items['opinion'], items['citations']
        docket.save()
        cluster.docket = docket
        cluster.save(index=False)  # Index only when the opinion is associated.

        for citation in citations:
            citation.cluster_id = cluster.pk
            citation.save()

        if cluster.judges:
            candidate_judges = get_candidate_judges(
                cluster.judges,
                docket.court.pk,
                cluster.date_filed,
            )
            if len(candidate_judges) == 1:
                opinion.author = candidate_judges[0]

            if len(candidate_judges) > 1:
                for candidate in candidate_judges:
                    cluster.panel.add(candidate)

        opinion.cluster = cluster
        opinion.save(index=index)
        if not backscrape:
            RealTimeQueue.objects.create(item_type='o', item_pk=opinion.pk)

    def scrape_court(self, site, full_crawl=False):
        """Crawl one parsed Juriscraper site, saving any new opinions.

        Aborts early when the site's URL hash is unchanged (unless
        ``full_crawl``), and stops updating the site hash if any download
        failed so the items are retried on the next run.
        """
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i, item in enumerate(site):
                msg, r = get_binary_content(item['download_urls'],
                                            site.cookies,
                                            site._get_adapter_instance(),
                                            method=site.method)
                if msg:
                    # BUG FIX: logger.warn is a deprecated alias of
                    # logger.warning.
                    logger.warning(msg)
                    ErrorLog(log_level='WARNING', court=court,
                             message=msg).save()
                    continue

                content = site.cleanup_content(r.content)

                current_date = item['case_dates']
                try:
                    next_date = site[i + 1]['case_dates']
                except IndexError:
                    next_date = None

                # request.content is sometimes a str, sometimes unicode, so
                # force it all to be bytes, pleasing hashlib.
                # NOTE(review): assumes the ``sha1`` helper in scope returns a
                # hex digest string (the later revision of this command calls
                # hashlib.sha1(...).hexdigest() explicitly) — confirm.
                sha1_hash = sha1(force_bytes(content))
                if (court_str == 'nev' and
                        item['precedential_statuses'] == 'Unpublished'):
                    # Nevada's non-precedential cases have different SHA1
                    # sums every time.
                    lookup_params = {
                        'lookup_value': item['download_urls'],
                        'lookup_by': 'download_url'
                    }
                else:
                    lookup_params = {
                        'lookup_value': sha1_hash,
                        'lookup_by': 'sha1'
                    }

                onwards = dup_checker.press_on(Opinion, current_date,
                                               next_date, **lookup_params)
                if dup_checker.emulate_break:
                    break

                if onwards:
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' %
                                item['download_urls'].encode('utf-8'))
                    dup_checker.reset()

                    docket, opinion, cluster, citations, error = \
                        self.make_objects(item, court, sha1_hash, content)

                    if error:
                        download_error = True
                        continue

                    self.save_everything(
                        items={
                            'docket': docket,
                            'opinion': opinion,
                            'cluster': cluster,
                            'citations': citations,
                        },
                        index=False,
                    )
                    extract_doc_content.delay(
                        opinion.pk,
                        do_ocr=True,
                        citation_jitter=True,
                    )

                    logger.info("Successfully added doc {pk}: {name}".format(
                        pk=opinion.pk,
                        name=item['case_names'].encode('utf-8'),
                    ))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled opinions." % site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)

    def parse_and_scrape_site(self, mod, full_crawl):
        """Parse the given Juriscraper module's site and scrape it."""
        site = mod.Site().parse()
        self.scrape_court(site, full_crawl)

    def handle(self, *args, **options):
        """Scrape every requested court once, or forever in daemon mode,
        pacing the crawl so all courts fit in ``--rate`` minutes.
        """
        super(Command, self).handle(*args, **options)
        global die_now

        # this line is used for handling SIGTERM (CTRL+4), so things can die
        # safely
        signal.signal(signal.SIGTERM, signal_handler)

        module_strings = build_module_list(options['court_id'])
        if not len(module_strings):
            raise CommandError('Unable to import module or package. '
                               'Aborting.')

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        wait = (options['rate'] * 60) / num_courts
        i = 0
        while i < num_courts:
            # this catches SIGTERM, so the code can be killed safely.
            if die_now:
                logger.info("The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)

            mod = __import__("%s.%s" % (package, module), globals(), locals(),
                             [module])
            # noinspection PyBroadException
            try:
                self.parse_and_scrape_site(mod, options['full_crawl'])
            except Exception:
                # noinspection PyBroadException
                try:
                    msg = ('********!! CRAWLER DOWN !!***********\n'
                           '*****scrape_court method failed!*****\n'
                           '********!! ACTION NEEDED !!**********\n%s' %
                           traceback.format_exc())
                    logger.critical(msg)

                    # opinions.united_states.federal.ca9_u --> ca9
                    court_str = mod.Site.__module__.split('.')[-1].split(
                        '_')[0]
                    court = Court.objects.get(pk=court_str)
                    ErrorLog(log_level='CRITICAL', court=court,
                             message=msg).save()
                except Exception:
                    # This is very important. Without this, an exception
                    # above will crash the caller.
                    pass
            finally:
                time.sleep(wait)
                last_court_in_list = (i == (num_courts - 1))
                if last_court_in_list and options['daemon']:
                    # Start over...
                    logger.info(
                        "All jurisdictions done. Looping back to "
                        "the beginning because daemon mode is enabled.")
                    i = 0
                else:
                    i += 1

        logger.info("The scraper has stopped.")
        sys.exit(0)
def test_make_short_name(self):
    """Verify CaseNameTweaker.make_case_name_short against known pairs.

    Each tuple is ``(full case name, expected short name)``. An empty
    expected string means the tweaker is expected to punt and return
    nothing rather than guess.
    """
    test_pairs = [
        # In re and Matter of
        ('In re Lissner', 'In re Lissner'),
        ('Matter of Lissner', 'Matter of Lissner'),
        # Plaintiff is in bad word list
        ('State v. Lissner', 'Lissner'),
        ('People v. Lissner', 'Lissner'),
        ('California v. Lissner', 'Lissner'),
        ('Dallas v. Lissner', 'Lissner'),
        # Basic 3-word case
        ('Langley v. Google', 'Langley'),
        # Similar to above, but more than 3 words
        ('Langley v. Google foo', 'Langley'),
        # United States v. ...
        ('United States v. Lissner', 'Lissner'),
        # Corporate first name
        ('Google, Inc. v. Langley', 'Langley'),
        ('Special, LLC v. Langley', 'Langley'),
        ('Google Corp. v. Langley', 'Langley'),
        # Shorter appellant than plaintiff
        ('Michael Lissner v. Langley', 'Langley'),
        # Multi-v with and w/o a bad_word
        ('Alameda v. Victor v. Keyboard', ''),
        ('Bloggers v. Victor v. Keyboard', ''),
        # Long left, short right
        ('Many words here v. Langley', 'Langley'),
        # Other manually added items
        ('Ilarion v. State', 'Ilarion'),
        ('Imery v. Vangil Ingenieros', 'Imery'),
        # Many more tests from real data!
        ('Bean v. City of Monahans', 'Bean'),
        ('Blanke v. Time, Inc.', 'Blanke'),
        ('New York Life Ins. Co. v. Deshotel', 'Deshotel'),
        ('Deatherage v. Deatherage', 'Deatherage'),
        ('Gonzalez Vargas v. Holder', ''),
        ('Campbell v. Wainwright', 'Campbell'),
        ('Liggett & Myers Tobacco Co. v. Finzer', 'Finzer'),
        ('United States v. Brenes', 'Brenes'),
        ('A.H. Robins Co., Inc. v. Eli Lilly & Co', ''),
        ('McKellar v. Hazen', 'McKellar'),
        ('Gil v. State', 'Gil'),
        ('Fuentes v. Owen', 'Fuentes'),
        ('State v. Shearer', 'Shearer'),
        ('United States v. Smither', 'Smither'),
        ('People v. Bradbury', 'Bradbury'),
        ('Venable (James) v. State', ''),
        ('Burkhardt v. Bailey', 'Burkhardt'),
        ('DeLorenzo v. Bales', 'DeLorenzo'),
        ('Loucks v. Bauman', 'Loucks'),
        ('Kenneth Stern v. Robert Weinstein', ''),
        ('Rayner v. Secretary of Health and Human Services', 'Rayner'),
        ('Rhyne v. Martin', 'Rhyne'),
        ('State v. Wolverton', 'Wolverton'),
        ('State v. Flood', 'Flood'),
        ('Amason v. Natural Gas Pipeline Co.', 'Amason'),
        ('United States v. Bryant', 'Bryant'),
        ('WELLS FARGO BANK v. APACHE TRIBE OF OKLAHOMA', ''),
        ('Stewart v. Tupperware Corp.', 'Stewart'),
        ('Society of New York Hosp. v. ASSOCIATED HOSP. SERV. OF NY', ''),
        ('Stein v. State Tax Commission', 'Stein'),
        (
            'The Putnam Pit, Inc. Geoffrey Davidian v. City of Cookeville, Tennessee Jim Shipley',
            ''),
        ('People v. Armstrong', 'Armstrong'),
        ('Weeks v. Weeks', 'Weeks'),
        ('Smith v. Xerox Corp.', ''),
        ('In Interest of Ad', ''),
        ('People v. Forsyth', 'Forsyth'),
        ('State v. LeClair', 'LeClair'),
        ('Agristor Credit Corp. v. Unruh', 'Unruh'),
        ('United States v. Larry L. Stewart', ''),
        ('Starling v. United States', 'Starling'),
        ('United States v. Pablo Colin-Molina', ''),
        ('Kenneth N. Juhl v. The United States', ''),
        ('Matter of Wilson', 'Matter of Wilson'),
        ('In Re Damon H.', ''),
        ('Centennial Ins. Co. v. Zylberberg', 'Zylberberg'),
        ('United States v. Donald Lee Stotler', ''),
        ('Byndloss v. State', 'Byndloss'),
        ('People v. Piatkowski', 'Piatkowski'),
        ('United States v. Willie James Morgan', ''),
        ('Harbison (Debra) v. Thieret (James)', ''),
        ('Federal Land Bank of Columbia v. Lieben', 'Lieben'),
        ('John Willard Greywind v. John T. Podrebarac', ''),
        ('State v. Powell', 'Powell'),
        ('Carr v. Galloway', 'Carr'),
        ('Saylors v. State', 'Saylors'),
        ('Jones v. Franke', 'Jones'),
        ('In Re Robert L. Mills, Debtor. Robert L. Mills v. Sdrawde '
         'Titleholders, Inc., a California Corporation', ''),
        ('Pollenex Corporation v. Sunbeam-Home Comfort, a Division of '
         'Sunbeam Corp., Raymond Industrial, Limited and Raymond Marketing '
         'Corporation of North America', ''),
        ('Longs v. State', 'Longs'),
        ('Performance Network Solutions v. Cyberklix', 'Cyberklix'),
        ('DiSabatino v. Salicete', 'DiSabatino'),
        ('State v. Jennifer Nicole Jackson', ''),
        ('United States v. Moreno', 'Moreno'),
        ('LOGAN & KANAWHA COAL v. Banque Francaise', ''),
        ('State v. Harrison', 'Harrison'),
        ('Efford v. Milam', 'Efford'),
        ('People v. Thompson', 'Thompson'),
        ('CINCINNATI THERMAL SPRAY v. Pender County', ''),
        ('JAH Ex Rel. RMH v. Wadle & Associates', ''),
        ('United Pub. Employees v. CITY & CTY. OF SAN FRAN.', ''),
        ('Warren v. Massachusetts Indemnity', 'Warren'),
        ('Marion Edwards v. State Farm Insurance Company and "John Doe,"',
         ''),
        ('Snowdon v. Grillo', 'Snowdon'),
        ('Adam Lunsford v. Cravens Funeral Home', ''),
        ('State v. Dillon', 'Dillon'),
        ('In Re Graham', 'In Re Graham'),
        ('Durham v. Chrysler Corp.', ''),  # Fails b/c Durham is a city!
        ('Carolyn Warrick v. Motiva Enterprises, L.L.C', ''),
        ('United States v. Aloi', 'Aloi'),
        ('United States Fidelity & Guaranty v. Graham', 'Graham'),
        ('Wildberger v. Rosenbaum', 'Wildberger'),
        ('Truck Insurance Exchange v. Michling', 'Michling'),
        ('Black Voters v. John J. McDonough', ''),
        ('State of Tennessee v. William F. Cain', ''),
        ('Robert J. Imbrogno v. Defense Logistics Agency', ''),
        ('Leetta Beachum, Administratrix v. Timothy Joseph White', ''),
        ('United States v. Jorge Gonzalez-Villegas', ''),
        ('Pitts v. Florida Bd. of Bar Examiners', 'Pitts'),
        ('State v. Pastushin', 'Pastushin'),
        ('Clark v. Clark', ''),
        ('Barrios v. Holder', 'Barrios'),
        ('Gregory L. Lavin v. United States', ''),
        ('Carpenter v. Consumers Power', 'Carpenter'),
        ('Derbabian v. S & C SNOWPLOWING, INC.', 'Derbabian'),
        ('Bright v. LSI CORP.', 'Bright'),
        ('State v. Brown', 'Brown'),
        ('KENNEY v. Keebler Co.', 'KENNEY'),
        ('Hill v. Chalanor', 'Hill'),
        ('Washington v. New Jersey', ''),
        ('Sollek v. Laseter', 'Sollek'),
        ('United States v. John Handy Jones, International Fidelity '
         'Insurance Company', ''),
        ('N.L.R.B. v. I. W. Corp', ''),
        ('Karpisek v. Cather & Sons Construction, Inc.', 'Karpisek'),
        ('Com. v. Wade', 'Com.'),
        ('Glascock v. Sukumlyn', 'Glascock'),
        ('Burroughs v. Hills', 'Burroughs'),
        ('State v. Darren Matthew Lee', ''),
        ('Mastondrea v. Occidental Hotels Management', 'Mastondrea'),
        ('Kent v. C. I. R', 'Kent'),
        ('Johnson v. City of Detroit', ''),
        ('Nolan v. United States', 'Nolan'),
        ('Currence v. Denver Tramway Corporation', 'Currence'),
        ('Matter of Cano', 'Matter of Cano'),
        # Two words after "Matter of --> Punt."
        ('Matter of Alphabet Soup', ''),
        # Zero words after "Matter of" --> Punt.
        ("Matter of", "Matter of"),
        ('Simmons v. Stalder', 'Simmons'),
        ('United States v. Donnell Hagood', ''),
        ('Kale v. United States INS', 'Kale'),
        ('Cmk v. Department of Revenue Ex Rel. Kb', 'Cmk'),
        ('State Farm Mut. Auto. Ins. Co. v. Barnes', 'Barnes'),
        ('In Re Krp', 'In Re Krp'),
        ('CH v. Department of Children and Families', 'CH'),
        ('Com. v. Monosky', 'Com.'),
        ('JITNEY-JUNGLE, INCORPORATED v. City of Brookhaven', ''),
        ('Carolyn Humphrey v. Memorial Hospitals Association', ''),
        ('Wagner v. Sanders Associates, Inc.', 'Wagner'),
        ('United States v. Venie (Arthur G.)', ''),
        ('Mitchell v. State', ''),
        ('City of Biloxi, Miss. v. Giuffrida', 'Giuffrida'),
        ('Sexton v. St. Clair Federal Sav. Bank', 'Sexton'),
        ('United States v. Matthews', 'Matthews'),
        ('Freeman v. Freeman', 'Freeman'),
        ('Spencer v. Toussaint', 'Spencer'),
        ('In Re Canaday', 'In Re Canaday'),
        ('Wenger v. Commission on Judicial Performance', 'Wenger'),
        ('Jackson v. Janecka', 'Janecka'),
        ('People of Michigan v. Ryan Christopher Smith', ''),
        ('Kincade (Michael) v. State', ''),
        ('Tonubbee v. River Parishes Guide', 'Tonubbee'),
        ('United States v. Richiez', 'Richiez'),
        ('In Re Allamaras', 'In Re Allamaras'),
        ('United States v. Capoccia', 'Capoccia'),
        ('Com. v. DeFranco', 'Com.'),
        ('Matheny v. Porter', 'Matheny'),
        ('Piper v. Hoffman', 'Piper'),
        ('People v. Smith', ''),  # Punted b/c People and Smith are bad.
        ('Mobuary, Joseph v. State.', ''),  # Punted b/c "State." has punct
    ]
    tweaker = CaseNameTweaker()
    for t in test_pairs:
        output = tweaker.make_case_name_short(t[0])
        self.assertEqual(output, t[1],
                         "Input was:\n\t%s\n\n\tExpected: '%s'\n\tActual: '%s'"
                         % (t[0], t[1], output))
def test_make_short_name(self):
    """Verify CaseNameTweaker.make_case_name_short against known pairs.

    Each tuple is ``(full case name, expected short name)``. An empty
    expected string means the tweaker is expected to punt and return
    nothing rather than guess.
    """
    test_pairs = [
        # In re and Matter of
        ("In re Lissner", "In re Lissner"),
        ("Matter of Lissner", "Matter of Lissner"),
        # Plaintiff is in bad word list
        ("State v. Lissner", "Lissner"),
        ("People v. Lissner", "Lissner"),
        ("California v. Lissner", "Lissner"),
        ("Dallas v. Lissner", "Lissner"),
        # Basic 3-word case
        ("Langley v. Google", "Langley"),
        # Similar to above, but more than 3 words
        ("Langley v. Google foo", "Langley"),
        # United States v. ...
        ("United States v. Lissner", "Lissner"),
        # Corporate first name
        ("Google, Inc. v. Langley", "Langley"),
        ("Special, LLC v. Langley", "Langley"),
        ("Google Corp. v. Langley", "Langley"),
        # Shorter appellant than plaintiff
        ("Michael Lissner v. Langley", "Langley"),
        # Multi-v with and w/o a bad_word
        ("Alameda v. Victor v. Keyboard", ""),
        ("Bloggers v. Victor v. Keyboard", ""),
        # Long left, short right
        ("Many words here v. Langley", "Langley"),
        # Other manually added items
        ("Ilarion v. State", "Ilarion"),
        ("Imery v. Vangil Ingenieros", "Imery"),
        # Many more tests from real data!
        ("Bean v. City of Monahans", "Bean"),
        ("Blanke v. Time, Inc.", "Blanke"),
        ("New York Life Ins. Co. v. Deshotel", "Deshotel"),
        ("Deatherage v. Deatherage", "Deatherage"),
        ("Gonzalez Vargas v. Holder", ""),
        ("Campbell v. Wainwright", "Campbell"),
        ("Liggett & Myers Tobacco Co. v. Finzer", "Finzer"),
        ("United States v. Brenes", "Brenes"),
        ("A.H. Robins Co., Inc. v. Eli Lilly & Co", ""),
        ("McKellar v. Hazen", "McKellar"),
        ("Gil v. State", "Gil"),
        ("Fuentes v. Owen", "Fuentes"),
        ("State v. Shearer", "Shearer"),
        ("United States v. Smither", "Smither"),
        ("People v. Bradbury", "Bradbury"),
        ("Venable (James) v. State", ""),
        ("Burkhardt v. Bailey", "Burkhardt"),
        ("DeLorenzo v. Bales", "DeLorenzo"),
        ("Loucks v. Bauman", "Loucks"),
        ("Kenneth Stern v. Robert Weinstein", ""),
        ("Rayner v. Secretary of Health and Human Services", "Rayner"),
        ("Rhyne v. Martin", "Rhyne"),
        ("State v. Wolverton", "Wolverton"),
        ("State v. Flood", "Flood"),
        ("Amason v. Natural Gas Pipeline Co.", "Amason"),
        ("United States v. Bryant", "Bryant"),
        ("WELLS FARGO BANK v. APACHE TRIBE OF OKLAHOMA", ""),
        ("Stewart v. Tupperware Corp.", "Stewart"),
        ("Society of New York Hosp. v. ASSOCIATED HOSP. SERV. OF NY", ""),
        ("Stein v. State Tax Commission", "Stein"),
        (
            "The Putnam Pit, Inc. Geoffrey Davidian v. City of Cookeville, Tennessee Jim Shipley",
            "",
        ),
        ("People v. Armstrong", "Armstrong"),
        ("Weeks v. Weeks", "Weeks"),
        ("Smith v. Xerox Corp.", ""),
        ("In Interest of Ad", ""),
        ("People v. Forsyth", "Forsyth"),
        ("State v. LeClair", "LeClair"),
        ("Agristor Credit Corp. v. Unruh", "Unruh"),
        ("United States v. Larry L. Stewart", ""),
        ("Starling v. United States", "Starling"),
        ("United States v. Pablo Colin-Molina", ""),
        ("Kenneth N. Juhl v. The United States", ""),
        ("Matter of Wilson", "Matter of Wilson"),
        ("In Re Damon H.", ""),
        ("Centennial Ins. Co. v. Zylberberg", "Zylberberg"),
        ("United States v. Donald Lee Stotler", ""),
        ("Byndloss v. State", "Byndloss"),
        ("People v. Piatkowski", "Piatkowski"),
        ("United States v. Willie James Morgan", ""),
        ("Harbison (Debra) v. Thieret (James)", ""),
        ("Federal Land Bank of Columbia v. Lieben", "Lieben"),
        ("John Willard Greywind v. John T. Podrebarac", ""),
        ("State v. Powell", "Powell"),
        ("Carr v. Galloway", "Carr"),
        ("Saylors v. State", "Saylors"),
        ("Jones v. Franke", "Jones"),
        (
            "In Re Robert L. Mills, Debtor. Robert L. Mills v. Sdrawde "
            "Titleholders, Inc., a California Corporation",
            "",
        ),
        (
            "Pollenex Corporation v. Sunbeam-Home Comfort, a Division of "
            "Sunbeam Corp., Raymond Industrial, Limited and Raymond Marketing "
            "Corporation of North America",
            "",
        ),
        ("Longs v. State", "Longs"),
        ("Performance Network Solutions v. Cyberklix", "Cyberklix"),
        ("DiSabatino v. Salicete", "DiSabatino"),
        ("State v. Jennifer Nicole Jackson", ""),
        ("United States v. Moreno", "Moreno"),
        ("LOGAN & KANAWHA COAL v. Banque Francaise", ""),
        ("State v. Harrison", "Harrison"),
        ("Efford v. Milam", "Efford"),
        ("People v. Thompson", "Thompson"),
        ("CINCINNATI THERMAL SPRAY v. Pender County", ""),
        ("JAH Ex Rel. RMH v. Wadle & Associates", ""),
        ("United Pub. Employees v. CITY & CTY. OF SAN FRAN.", ""),
        ("Warren v. Massachusetts Indemnity", "Warren"),
        (
            'Marion Edwards v. State Farm Insurance Company and "John Doe,"',
            "",
        ),
        ("Snowdon v. Grillo", "Snowdon"),
        ("Adam Lunsford v. Cravens Funeral Home", ""),
        ("State v. Dillon", "Dillon"),
        ("In Re Graham", "In Re Graham"),
        ("Durham v. Chrysler Corp.", ""),  # Fails b/c Durham is a city!
        ("Carolyn Warrick v. Motiva Enterprises, L.L.C", ""),
        ("United States v. Aloi", "Aloi"),
        ("United States Fidelity & Guaranty v. Graham", "Graham"),
        ("Wildberger v. Rosenbaum", "Wildberger"),
        ("Truck Insurance Exchange v. Michling", "Michling"),
        ("Black Voters v. John J. McDonough", ""),
        ("State of Tennessee v. William F. Cain", ""),
        ("Robert J. Imbrogno v. Defense Logistics Agency", ""),
        ("Leetta Beachum, Administratrix v. Timothy Joseph White", ""),
        ("United States v. Jorge Gonzalez-Villegas", ""),
        ("Pitts v. Florida Bd. of Bar Examiners", "Pitts"),
        ("State v. Pastushin", "Pastushin"),
        ("Clark v. Clark", ""),
        ("Barrios v. Holder", "Barrios"),
        ("Gregory L. Lavin v. United States", ""),
        ("Carpenter v. Consumers Power", "Carpenter"),
        ("Derbabian v. S & C SNOWPLOWING, INC.", "Derbabian"),
        ("Bright v. LSI CORP.", "Bright"),
        ("State v. Brown", "Brown"),
        ("KENNEY v. Keebler Co.", "KENNEY"),
        ("Hill v. Chalanor", "Hill"),
        ("Washington v. New Jersey", ""),
        ("Sollek v. Laseter", "Sollek"),
        (
            "United States v. John Handy Jones, International Fidelity "
            "Insurance Company",
            "",
        ),
        ("N.L.R.B. v. I. W. Corp", ""),
        ("Karpisek v. Cather & Sons Construction, Inc.", "Karpisek"),
        ("Com. v. Wade", "Com."),
        ("Glascock v. Sukumlyn", "Glascock"),
        ("Burroughs v. Hills", "Burroughs"),
        ("State v. Darren Matthew Lee", ""),
        ("Mastondrea v. Occidental Hotels Management", "Mastondrea"),
        ("Kent v. C. I. R", "Kent"),
        ("Johnson v. City of Detroit", ""),
        ("Nolan v. United States", "Nolan"),
        ("Currence v. Denver Tramway Corporation", "Currence"),
        ("Matter of Cano", "Matter of Cano"),
        # Two words after "Matter of --> Punt."
        ("Matter of Alphabet Soup", ""),
        # Zero words after "Matter of" --> Punt.
        ("Matter of", "Matter of"),
        ("Simmons v. Stalder", "Simmons"),
        ("United States v. Donnell Hagood", ""),
        ("Kale v. United States INS", "Kale"),
        ("Cmk v. Department of Revenue Ex Rel. Kb", "Cmk"),
        ("State Farm Mut. Auto. Ins. Co. v. Barnes", "Barnes"),
        ("In Re Krp", "In Re Krp"),
        ("CH v. Department of Children and Families", "CH"),
        ("Com. v. Monosky", "Com."),
        ("JITNEY-JUNGLE, INCORPORATED v. City of Brookhaven", ""),
        ("Carolyn Humphrey v. Memorial Hospitals Association", ""),
        ("Wagner v. Sanders Associates, Inc.", "Wagner"),
        ("United States v. Venie (Arthur G.)", ""),
        ("Mitchell v. State", ""),
        ("City of Biloxi, Miss. v. Giuffrida", "Giuffrida"),
        ("Sexton v. St. Clair Federal Sav. Bank", "Sexton"),
        ("United States v. Matthews", "Matthews"),
        ("Freeman v. Freeman", "Freeman"),
        ("Spencer v. Toussaint", "Spencer"),
        ("In Re Canaday", "In Re Canaday"),
        ("Wenger v. Commission on Judicial Performance", "Wenger"),
        ("Jackson v. Janecka", "Janecka"),
        ("People of Michigan v. Ryan Christopher Smith", ""),
        ("Kincade (Michael) v. State", ""),
        ("Tonubbee v. River Parishes Guide", "Tonubbee"),
        ("United States v. Richiez", "Richiez"),
        ("In Re Allamaras", "In Re Allamaras"),
        ("United States v. Capoccia", "Capoccia"),
        ("Com. v. DeFranco", "Com."),
        ("Matheny v. Porter", "Matheny"),
        ("Piper v. Hoffman", "Piper"),
        ("People v. Smith", ""),  # Punted b/c People and Smith are bad.
        ("Mobuary, Joseph v. State.", ""),  # Punted b/c "State." has punct
    ]
    tweaker = CaseNameTweaker()
    for t in test_pairs:
        output = tweaker.make_case_name_short(t[0])
        self.assertEqual(
            output,
            t[1],
            "Input was:\n\t%s\n\n\tExpected: '%s'\n\tActual: '%s'"
            % (t[0], t[1], output),
        )
class Command(VerboseCommand):
    """Run the Juriscraper toolkit against one or many jurisdictions,
    downloading new opinions, de-duplicating them, and saving them to the
    database.
    """
    help = 'Runs the Juriscraper toolkit against one or many jurisdictions.'

    def __init__(self, stdout=None, stderr=None, no_color=False):
        # BUG FIX: the original ignored its arguments and always forwarded
        # literal defaults to super(); pass the caller's values through so
        # output streams and color settings are honored.
        super(Command, self).__init__(stdout=stdout, stderr=stderr,
                                      no_color=no_color)
        self.cnt = CaseNameTweaker()

    def add_arguments(self, parser):
        """Register the command-line options for this command."""
        parser.add_argument(
            '--daemon',
            action='store_true',
            default=False,
            help=('Use this flag to turn on daemon mode, in which all '
                  'courts requested will be scraped in turn, '
                  'nonstop, in a loop.'),
        )
        parser.add_argument(
            '--rate',
            type=int,
            default=30,
            help=('The length of time in minutes it takes to crawl '
                  'all requested courts. Particularly useful if it is '
                  'desired to quickly scrape over all courts. Default '
                  'is 30 minutes.'),
        )
        parser.add_argument(
            '--courts',
            type=str,
            dest='court_id',
            metavar="COURTID",
            required=True,
            help=('The court(s) to scrape and extract. This should be '
                  'in the form of a python module or package import '
                  'from the Juriscraper library, e.g. '
                  '"juriscraper.opinions.united_states.federal_appellate.ca1" '
                  'or simply "opinions" to do all opinions.'),
        )
        parser.add_argument(
            '--fullcrawl',
            dest='full_crawl',
            action='store_true',
            default=False,
            help="Disable duplicate aborting.",
        )

    def make_objects(self, item, court, sha1_hash, content):
        """Takes the meta data from the scraper and associates it with objects.

        Returns the created objects as a tuple of
        (docket, opinion, cluster, citations, error), where ``error`` is
        True when the binary content could not be saved to disk. None of
        the returned objects has been persisted yet except the ErrorLog
        written on failure.
        """
        blocked = item['blocked_statuses']
        if blocked:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))
        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            source=Docket.SCRAPER,
        )

        west_cite_str = item.get('west_citations', '')
        state_cite_str = item.get('west_state_citations', '')
        neutral_cite_str = item.get('neutral_citations', '')
        cluster = OpinionCluster(
            judges=item.get('judges', ''),
            date_filed=item['case_dates'],
            date_filed_is_approximate=item['date_filed_is_approximate'],
            case_name=item['case_names'],
            case_name_short=case_name_short,
            source='C',
            precedential_status=item['precedential_statuses'],
            nature_of_suit=item.get('nature_of_suit', ''),
            blocked=blocked,
            date_blocked=date_blocked,
            # These three fields are replaced below.
            federal_cite_one=west_cite_str,
            state_cite_one=state_cite_str,
            neutral_cite=neutral_cite_str,
            syllabus=item.get('summaries', ''),
        )
        citations = []
        cite_types = [
            (west_cite_str, Citation.WEST),
            (state_cite_str, Citation.STATE),
            (neutral_cite_str, Citation.NEUTRAL),
        ]
        for cite_str, cite_type in cite_types:
            if cite_str:
                citations.append(make_citation(cite_str, cluster, cite_type))
        opinion = Opinion(
            type='010combined',
            sha1=sha1_hash,
            download_url=item['download_urls'],
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            file_name = trunc(item['case_names'].lower(), 75) + extension
            opinion.file_with_date = cluster.date_filed
            opinion.local_path.save(file_name, cf, save=False)
        except Exception:
            # BUG FIX: was a bare ``except:`` which also swallowed
            # SystemExit/KeyboardInterrupt. Still deliberately broad: any
            # failure here is logged and reported via the error flag.
            msg = ('Unable to save binary to disk. Deleted '
                   'item: %s.\n %s' %
                   (item['case_names'], traceback.format_exc()))
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, opinion, cluster, citations, error

    def save_everything(self, items, index=False, backscrape=False):
        """Saves all the sub items and associates them as appropriate.
        """
        docket, cluster = items['docket'], items['cluster']
        opinion, citations = items['opinion'], items['citations']
        docket.save()
        cluster.docket = docket
        cluster.save(index=False)  # Index only when the opinion is associated.

        for citation in citations:
            citation.cluster_id = cluster.pk
            citation.save()

        if cluster.judges:
            candidate_judges = get_candidate_judges(
                cluster.judges,
                docket.court.pk,
                cluster.date_filed,
            )
            if len(candidate_judges) == 1:
                opinion.author = candidate_judges[0]

            if len(candidate_judges) > 1:
                for candidate in candidate_judges:
                    cluster.panel.add(candidate)

        opinion.cluster = cluster
        opinion.save(index=index)
        if not backscrape:
            RealTimeQueue.objects.create(item_type='o', item_pk=opinion.pk)

    def scrape_court(self, site, full_crawl=False):
        """Crawl one parsed Juriscraper site, saving any new opinions.

        Aborts early when the site's URL hash is unchanged (unless
        ``full_crawl``), and stops updating the site hash if any download
        failed so the items are retried on the next run.
        """
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i, item in enumerate(site):
                msg, r = get_binary_content(
                    item['download_urls'],
                    site.cookies,
                    site._get_adapter_instance(),
                    method=site.method
                )
                if msg:
                    # BUG FIX: logger.warn is a deprecated alias of
                    # logger.warning.
                    logger.warning(msg)
                    ErrorLog(log_level='WARNING', court=court,
                             message=msg).save()
                    continue

                content = site.cleanup_content(r.content)

                current_date = item['case_dates']
                try:
                    next_date = site[i + 1]['case_dates']
                except IndexError:
                    next_date = None

                # request.content is sometimes a str, sometimes unicode, so
                # force it all to be bytes, pleasing hashlib.
                sha1_hash = hashlib.sha1(force_bytes(content)).hexdigest()
                if (court_str == 'nev' and
                        item['precedential_statuses'] == 'Unpublished'):
                    # Nevada's non-precedential cases have different SHA1
                    # sums every time.
                    lookup_params = {'lookup_value': item['download_urls'],
                                     'lookup_by': 'download_url'}
                else:
                    lookup_params = {'lookup_value': sha1_hash,
                                     'lookup_by': 'sha1'}

                onwards = dup_checker.press_on(Opinion, current_date,
                                               next_date, **lookup_params)
                if dup_checker.emulate_break:
                    break

                if onwards:
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' %
                                item['download_urls'].encode('utf-8'))
                    dup_checker.reset()

                    docket, opinion, cluster, citations, error = \
                        self.make_objects(item, court, sha1_hash, content)

                    if error:
                        download_error = True
                        continue

                    self.save_everything(
                        items={
                            'docket': docket,
                            'opinion': opinion,
                            'cluster': cluster,
                            'citations': citations,
                        },
                        index=False,
                    )
                    extract_doc_content.delay(
                        opinion.pk,
                        do_ocr=True,
                        citation_jitter=True,
                    )

                    logger.info("Successfully added doc {pk}: {name}".format(
                        pk=opinion.pk,
                        name=item['case_names'].encode('utf-8'),
                    ))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled opinions." % site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)

    def parse_and_scrape_site(self, mod, full_crawl):
        """Parse the given Juriscraper module's site and scrape it."""
        site = mod.Site().parse()
        self.scrape_court(site, full_crawl)

    def handle(self, *args, **options):
        """Scrape every requested court once, or forever in daemon mode,
        pacing the crawl so all courts fit in ``--rate`` minutes.
        """
        super(Command, self).handle(*args, **options)
        global die_now

        # this line is used for handling SIGTERM (CTRL+4), so things can die
        # safely
        signal.signal(signal.SIGTERM, signal_handler)

        module_strings = build_module_list(options['court_id'])
        if not len(module_strings):
            raise CommandError('Unable to import module or package. '
                               'Aborting.')

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        wait = (options['rate'] * 60) / num_courts
        i = 0
        while i < num_courts:
            # this catches SIGTERM, so the code can be killed safely.
            if die_now:
                logger.info("The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)

            mod = __import__(
                "%s.%s" % (package, module), globals(), locals(), [module]
            )
            # noinspection PyBroadException
            try:
                self.parse_and_scrape_site(mod, options['full_crawl'])
            except Exception:
                # noinspection PyBroadException
                try:
                    msg = ('********!! CRAWLER DOWN !!***********\n'
                           '*****scrape_court method failed!*****\n'
                           '********!! ACTION NEEDED !!**********\n%s' %
                           traceback.format_exc())
                    logger.critical(msg)

                    # opinions.united_states.federal.ca9_u --> ca9
                    court_str = mod.Site.__module__.split('.')[-1].split('_')[0]
                    court = Court.objects.get(pk=court_str)
                    ErrorLog(
                        log_level='CRITICAL', court=court, message=msg
                    ).save()
                except Exception:
                    # This is very important. Without this, an exception
                    # above will crash the caller.
                    pass
            finally:
                time.sleep(wait)
                last_court_in_list = (i == (num_courts - 1))
                if last_court_in_list and options['daemon']:
                    # Start over...
                    logger.info("All jurisdictions done. Looping back to "
                                "the beginning because daemon mode is enabled.")
                    i = 0
                else:
                    i += 1

        logger.info("The scraper has stopped.")
        sys.exit(0)