class Command(VerboseCommand):
    """Run the Juriscraper toolkit against one or many jurisdictions,
    downloading new opinions, de-duplicating them, and saving them to the
    database.
    """
    help = 'Runs the Juriscraper toolkit against one or many jurisdictions.'

    def __init__(self, stdout=None, stderr=None, no_color=False):
        # BUG FIX: the original ignored its arguments and always forwarded
        # literal defaults to super(); pass the caller's values through so
        # output streams and color settings are honored.
        super(Command, self).__init__(stdout=stdout, stderr=stderr,
                                      no_color=no_color)
        self.cnt = CaseNameTweaker()

    def add_arguments(self, parser):
        """Register the command-line options for this command."""
        parser.add_argument(
            '--daemon',
            action='store_true',
            default=False,
            help=('Use this flag to turn on daemon mode, in which all '
                  'courts requested will be scraped in turn, '
                  'nonstop, in a loop.'),
        )
        parser.add_argument(
            '--rate',
            type=int,
            default=30,
            help=('The length of time in minutes it takes to crawl '
                  'all requested courts. Particularly useful if it is '
                  'desired to quickly scrape over all courts. Default '
                  'is 30 minutes.'),
        )
        parser.add_argument(
            '--courts',
            type=str,
            dest='court_id',
            metavar="COURTID",
            required=True,
            help=('The court(s) to scrape and extract. This should be '
                  'in the form of a python module or package import '
                  'from the Juriscraper library, e.g. '
                  '"juriscraper.opinions.united_states.federal_appellate.ca1" '
                  'or simply "opinions" to do all opinions.'),
        )
        parser.add_argument(
            '--fullcrawl',
            dest='full_crawl',
            action='store_true',
            default=False,
            help="Disable duplicate aborting.",
        )

    def make_objects(self, item, court, sha1_hash, content):
        """Takes the meta data from the scraper and associates it with objects.

        Returns the created objects as a tuple of
        (docket, opinion, cluster, citations, error), where ``error`` is
        True when the binary content could not be saved to disk. None of
        the returned objects has been persisted yet except the ErrorLog
        written on failure.
        """
        blocked = item['blocked_statuses']
        if blocked:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))
        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            source=Docket.SCRAPER,
        )

        west_cite_str = item.get('west_citations', '')
        state_cite_str = item.get('west_state_citations', '')
        neutral_cite_str = item.get('neutral_citations', '')
        cluster = OpinionCluster(
            judges=item.get('judges', ''),
            date_filed=item['case_dates'],
            date_filed_is_approximate=item['date_filed_is_approximate'],
            case_name=item['case_names'],
            case_name_short=case_name_short,
            source='C',
            precedential_status=item['precedential_statuses'],
            nature_of_suit=item.get('nature_of_suit', ''),
            blocked=blocked,
            date_blocked=date_blocked,
            # These three fields are replaced below.
            federal_cite_one=west_cite_str,
            state_cite_one=state_cite_str,
            neutral_cite=neutral_cite_str,
            syllabus=item.get('summaries', ''),
        )
        citations = []
        cite_types = [
            (west_cite_str, Citation.WEST),
            (state_cite_str, Citation.STATE),
            (neutral_cite_str, Citation.NEUTRAL),
        ]
        for cite_str, cite_type in cite_types:
            if cite_str:
                citations.append(make_citation(cite_str, cluster, cite_type))
        opinion = Opinion(
            type='010combined',
            sha1=sha1_hash,
            download_url=item['download_urls'],
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            file_name = trunc(item['case_names'].lower(), 75) + extension
            opinion.file_with_date = cluster.date_filed
            opinion.local_path.save(file_name, cf, save=False)
        except Exception:
            # BUG FIX: was a bare ``except:`` which also swallowed
            # SystemExit/KeyboardInterrupt. Still deliberately broad: any
            # failure here is logged and reported via the error flag.
            msg = ('Unable to save binary to disk. Deleted '
                   'item: %s.\n %s' %
                   (item['case_names'], traceback.format_exc()))
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, opinion, cluster, citations, error

    def save_everything(self, items, index=False, backscrape=False):
        """Saves all the sub items and associates them as appropriate.
        """
        docket, cluster = items['docket'], items['cluster']
        opinion, citations = items['opinion'], items['citations']
        docket.save()
        cluster.docket = docket
        cluster.save(index=False)  # Index only when the opinion is associated.

        for citation in citations:
            citation.cluster_id = cluster.pk
            citation.save()

        if cluster.judges:
            candidate_judges = get_candidate_judges(
                cluster.judges,
                docket.court.pk,
                cluster.date_filed,
            )
            if len(candidate_judges) == 1:
                opinion.author = candidate_judges[0]

            if len(candidate_judges) > 1:
                for candidate in candidate_judges:
                    cluster.panel.add(candidate)

        opinion.cluster = cluster
        opinion.save(index=index)
        if not backscrape:
            RealTimeQueue.objects.create(item_type='o', item_pk=opinion.pk)

    def scrape_court(self, site, full_crawl=False):
        """Crawl one parsed Juriscraper site, saving any new opinions.

        Aborts early when the site's URL hash is unchanged (unless
        ``full_crawl``), and stops updating the site hash if any download
        failed so the items are retried on the next run.
        """
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i, item in enumerate(site):
                msg, r = get_binary_content(item['download_urls'],
                                            site.cookies,
                                            site._get_adapter_instance(),
                                            method=site.method)
                if msg:
                    # BUG FIX: logger.warn is a deprecated alias of
                    # logger.warning.
                    logger.warning(msg)
                    ErrorLog(log_level='WARNING', court=court,
                             message=msg).save()
                    continue

                content = site.cleanup_content(r.content)

                current_date = item['case_dates']
                try:
                    next_date = site[i + 1]['case_dates']
                except IndexError:
                    next_date = None

                # request.content is sometimes a str, sometimes unicode, so
                # force it all to be bytes, pleasing hashlib.
                # NOTE(review): assumes the ``sha1`` helper in scope returns a
                # hex digest string (the later revision of this command calls
                # hashlib.sha1(...).hexdigest() explicitly) — confirm.
                sha1_hash = sha1(force_bytes(content))
                if (court_str == 'nev' and
                        item['precedential_statuses'] == 'Unpublished'):
                    # Nevada's non-precedential cases have different SHA1
                    # sums every time.
                    lookup_params = {
                        'lookup_value': item['download_urls'],
                        'lookup_by': 'download_url'
                    }
                else:
                    lookup_params = {
                        'lookup_value': sha1_hash,
                        'lookup_by': 'sha1'
                    }

                onwards = dup_checker.press_on(Opinion, current_date,
                                               next_date, **lookup_params)
                if dup_checker.emulate_break:
                    break

                if onwards:
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' %
                                item['download_urls'].encode('utf-8'))
                    dup_checker.reset()

                    docket, opinion, cluster, citations, error = \
                        self.make_objects(item, court, sha1_hash, content)

                    if error:
                        download_error = True
                        continue

                    self.save_everything(
                        items={
                            'docket': docket,
                            'opinion': opinion,
                            'cluster': cluster,
                            'citations': citations,
                        },
                        index=False,
                    )
                    extract_doc_content.delay(
                        opinion.pk,
                        do_ocr=True,
                        citation_jitter=True,
                    )

                    logger.info("Successfully added doc {pk}: {name}".format(
                        pk=opinion.pk,
                        name=item['case_names'].encode('utf-8'),
                    ))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled opinions." % site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)

    def parse_and_scrape_site(self, mod, full_crawl):
        """Parse the given Juriscraper module's site and scrape it."""
        site = mod.Site().parse()
        self.scrape_court(site, full_crawl)

    def handle(self, *args, **options):
        """Scrape every requested court once, or forever in daemon mode,
        pacing the crawl so all courts fit in ``--rate`` minutes.
        """
        super(Command, self).handle(*args, **options)
        global die_now

        # this line is used for handling SIGTERM (CTRL+4), so things can die
        # safely
        signal.signal(signal.SIGTERM, signal_handler)

        module_strings = build_module_list(options['court_id'])
        if not len(module_strings):
            raise CommandError('Unable to import module or package. '
                               'Aborting.')

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        wait = (options['rate'] * 60) / num_courts
        i = 0
        while i < num_courts:
            # this catches SIGTERM, so the code can be killed safely.
            if die_now:
                logger.info("The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)

            mod = __import__("%s.%s" % (package, module), globals(), locals(),
                             [module])
            # noinspection PyBroadException
            try:
                self.parse_and_scrape_site(mod, options['full_crawl'])
            except Exception:
                # noinspection PyBroadException
                try:
                    msg = ('********!! CRAWLER DOWN !!***********\n'
                           '*****scrape_court method failed!*****\n'
                           '********!! ACTION NEEDED !!**********\n%s' %
                           traceback.format_exc())
                    logger.critical(msg)

                    # opinions.united_states.federal.ca9_u --> ca9
                    court_str = mod.Site.__module__.split('.')[-1].split(
                        '_')[0]
                    court = Court.objects.get(pk=court_str)
                    ErrorLog(log_level='CRITICAL', court=court,
                             message=msg).save()
                except Exception:
                    # This is very important. Without this, an exception
                    # above will crash the caller.
                    pass
            finally:
                time.sleep(wait)
                last_court_in_list = (i == (num_courts - 1))
                if last_court_in_list and options['daemon']:
                    # Start over...
                    logger.info(
                        "All jurisdictions done. Looping back to "
                        "the beginning because daemon mode is enabled.")
                    i = 0
                else:
                    i += 1

        logger.info("The scraper has stopped.")
        sys.exit(0)
def test_make_short_name(self):
    """Verify CaseNameTweaker.make_case_name_short against known pairs.

    Each tuple is ``(full case name, expected short name)``. An empty
    expected string means the tweaker is expected to punt and return
    nothing rather than guess.
    """
    test_pairs = [
        # In re and Matter of
        ('In re Lissner', 'In re Lissner'),
        ('Matter of Lissner', 'Matter of Lissner'),
        # Plaintiff is in bad word list
        ('State v. Lissner', 'Lissner'),
        ('People v. Lissner', 'Lissner'),
        ('California v. Lissner', 'Lissner'),
        ('Dallas v. Lissner', 'Lissner'),
        # Basic 3-word case
        ('Langley v. Google', 'Langley'),
        # Similar to above, but more than 3 words
        ('Langley v. Google foo', 'Langley'),
        # United States v. ...
        ('United States v. Lissner', 'Lissner'),
        # Corporate first name
        ('Google, Inc. v. Langley', 'Langley'),
        ('Special, LLC v. Langley', 'Langley'),
        ('Google Corp. v. Langley', 'Langley'),
        # Shorter appellant than plaintiff
        ('Michael Lissner v. Langley', 'Langley'),
        # Multi-v with and w/o a bad_word
        ('Alameda v. Victor v. Keyboard', ''),
        ('Bloggers v. Victor v. Keyboard', ''),
        # Long left, short right
        ('Many words here v. Langley', 'Langley'),
        # Other manually added items
        ('Ilarion v. State', 'Ilarion'),
        ('Imery v. Vangil Ingenieros', 'Imery'),
        # Many more tests from real data!
        ('Bean v. City of Monahans', 'Bean'),
        ('Blanke v. Time, Inc.', 'Blanke'),
        ('New York Life Ins. Co. v. Deshotel', 'Deshotel'),
        ('Deatherage v. Deatherage', 'Deatherage'),
        ('Gonzalez Vargas v. Holder', ''),
        ('Campbell v. Wainwright', 'Campbell'),
        ('Liggett & Myers Tobacco Co. v. Finzer', 'Finzer'),
        ('United States v. Brenes', 'Brenes'),
        ('A.H. Robins Co., Inc. v. Eli Lilly & Co', ''),
        ('McKellar v. Hazen', 'McKellar'),
        ('Gil v. State', 'Gil'),
        ('Fuentes v. Owen', 'Fuentes'),
        ('State v. Shearer', 'Shearer'),
        ('United States v. Smither', 'Smither'),
        ('People v. Bradbury', 'Bradbury'),
        ('Venable (James) v. State', ''),
        ('Burkhardt v. Bailey', 'Burkhardt'),
        ('DeLorenzo v. Bales', 'DeLorenzo'),
        ('Loucks v. Bauman', 'Loucks'),
        ('Kenneth Stern v. Robert Weinstein', ''),
        ('Rayner v. Secretary of Health and Human Services', 'Rayner'),
        ('Rhyne v. Martin', 'Rhyne'),
        ('State v. Wolverton', 'Wolverton'),
        ('State v. Flood', 'Flood'),
        ('Amason v. Natural Gas Pipeline Co.', 'Amason'),
        ('United States v. Bryant', 'Bryant'),
        ('WELLS FARGO BANK v. APACHE TRIBE OF OKLAHOMA', ''),
        ('Stewart v. Tupperware Corp.', 'Stewart'),
        ('Society of New York Hosp. v. ASSOCIATED HOSP. SERV. OF NY', ''),
        ('Stein v. State Tax Commission', 'Stein'),
        (
            'The Putnam Pit, Inc. Geoffrey Davidian v. City of Cookeville, Tennessee Jim Shipley',
            ''),
        ('People v. Armstrong', 'Armstrong'),
        ('Weeks v. Weeks', 'Weeks'),
        ('Smith v. Xerox Corp.', ''),
        ('In Interest of Ad', ''),
        ('People v. Forsyth', 'Forsyth'),
        ('State v. LeClair', 'LeClair'),
        ('Agristor Credit Corp. v. Unruh', 'Unruh'),
        ('United States v. Larry L. Stewart', ''),
        ('Starling v. United States', 'Starling'),
        ('United States v. Pablo Colin-Molina', ''),
        ('Kenneth N. Juhl v. The United States', ''),
        ('Matter of Wilson', 'Matter of Wilson'),
        ('In Re Damon H.', ''),
        ('Centennial Ins. Co. v. Zylberberg', 'Zylberberg'),
        ('United States v. Donald Lee Stotler', ''),
        ('Byndloss v. State', 'Byndloss'),
        ('People v. Piatkowski', 'Piatkowski'),
        ('United States v. Willie James Morgan', ''),
        ('Harbison (Debra) v. Thieret (James)', ''),
        ('Federal Land Bank of Columbia v. Lieben', 'Lieben'),
        ('John Willard Greywind v. John T. Podrebarac', ''),
        ('State v. Powell', 'Powell'),
        ('Carr v. Galloway', 'Carr'),
        ('Saylors v. State', 'Saylors'),
        ('Jones v. Franke', 'Jones'),
        ('In Re Robert L. Mills, Debtor. Robert L. Mills v. Sdrawde '
         'Titleholders, Inc., a California Corporation', ''),
        ('Pollenex Corporation v. Sunbeam-Home Comfort, a Division of '
         'Sunbeam Corp., Raymond Industrial, Limited and Raymond Marketing '
         'Corporation of North America', ''),
        ('Longs v. State', 'Longs'),
        ('Performance Network Solutions v. Cyberklix', 'Cyberklix'),
        ('DiSabatino v. Salicete', 'DiSabatino'),
        ('State v. Jennifer Nicole Jackson', ''),
        ('United States v. Moreno', 'Moreno'),
        ('LOGAN & KANAWHA COAL v. Banque Francaise', ''),
        ('State v. Harrison', 'Harrison'),
        ('Efford v. Milam', 'Efford'),
        ('People v. Thompson', 'Thompson'),
        ('CINCINNATI THERMAL SPRAY v. Pender County', ''),
        ('JAH Ex Rel. RMH v. Wadle & Associates', ''),
        ('United Pub. Employees v. CITY & CTY. OF SAN FRAN.', ''),
        ('Warren v. Massachusetts Indemnity', 'Warren'),
        ('Marion Edwards v. State Farm Insurance Company and "John Doe,"',
         ''),
        ('Snowdon v. Grillo', 'Snowdon'),
        ('Adam Lunsford v. Cravens Funeral Home', ''),
        ('State v. Dillon', 'Dillon'),
        ('In Re Graham', 'In Re Graham'),
        ('Durham v. Chrysler Corp.', ''),  # Fails b/c Durham is a city!
        ('Carolyn Warrick v. Motiva Enterprises, L.L.C', ''),
        ('United States v. Aloi', 'Aloi'),
        ('United States Fidelity & Guaranty v. Graham', 'Graham'),
        ('Wildberger v. Rosenbaum', 'Wildberger'),
        ('Truck Insurance Exchange v. Michling', 'Michling'),
        ('Black Voters v. John J. McDonough', ''),
        ('State of Tennessee v. William F. Cain', ''),
        ('Robert J. Imbrogno v. Defense Logistics Agency', ''),
        ('Leetta Beachum, Administratrix v. Timothy Joseph White', ''),
        ('United States v. Jorge Gonzalez-Villegas', ''),
        ('Pitts v. Florida Bd. of Bar Examiners', 'Pitts'),
        ('State v. Pastushin', 'Pastushin'),
        ('Clark v. Clark', ''),
        ('Barrios v. Holder', 'Barrios'),
        ('Gregory L. Lavin v. United States', ''),
        ('Carpenter v. Consumers Power', 'Carpenter'),
        ('Derbabian v. S & C SNOWPLOWING, INC.', 'Derbabian'),
        ('Bright v. LSI CORP.', 'Bright'),
        ('State v. Brown', 'Brown'),
        ('KENNEY v. Keebler Co.', 'KENNEY'),
        ('Hill v. Chalanor', 'Hill'),
        ('Washington v. New Jersey', ''),
        ('Sollek v. Laseter', 'Sollek'),
        ('United States v. John Handy Jones, International Fidelity '
         'Insurance Company', ''),
        ('N.L.R.B. v. I. W. Corp', ''),
        ('Karpisek v. Cather & Sons Construction, Inc.', 'Karpisek'),
        ('Com. v. Wade', 'Com.'),
        ('Glascock v. Sukumlyn', 'Glascock'),
        ('Burroughs v. Hills', 'Burroughs'),
        ('State v. Darren Matthew Lee', ''),
        ('Mastondrea v. Occidental Hotels Management', 'Mastondrea'),
        ('Kent v. C. I. R', 'Kent'),
        ('Johnson v. City of Detroit', ''),
        ('Nolan v. United States', 'Nolan'),
        ('Currence v. Denver Tramway Corporation', 'Currence'),
        ('Matter of Cano', 'Matter of Cano'),
        # Two words after "Matter of --> Punt."
        ('Matter of Alphabet Soup', ''),
        # Zero words after "Matter of" --> Punt.
        ("Matter of", "Matter of"),
        ('Simmons v. Stalder', 'Simmons'),
        ('United States v. Donnell Hagood', ''),
        ('Kale v. United States INS', 'Kale'),
        ('Cmk v. Department of Revenue Ex Rel. Kb', 'Cmk'),
        ('State Farm Mut. Auto. Ins. Co. v. Barnes', 'Barnes'),
        ('In Re Krp', 'In Re Krp'),
        ('CH v. Department of Children and Families', 'CH'),
        ('Com. v. Monosky', 'Com.'),
        ('JITNEY-JUNGLE, INCORPORATED v. City of Brookhaven', ''),
        ('Carolyn Humphrey v. Memorial Hospitals Association', ''),
        ('Wagner v. Sanders Associates, Inc.', 'Wagner'),
        ('United States v. Venie (Arthur G.)', ''),
        ('Mitchell v. State', ''),
        ('City of Biloxi, Miss. v. Giuffrida', 'Giuffrida'),
        ('Sexton v. St. Clair Federal Sav. Bank', 'Sexton'),
        ('United States v. Matthews', 'Matthews'),
        ('Freeman v. Freeman', 'Freeman'),
        ('Spencer v. Toussaint', 'Spencer'),
        ('In Re Canaday', 'In Re Canaday'),
        ('Wenger v. Commission on Judicial Performance', 'Wenger'),
        ('Jackson v. Janecka', 'Janecka'),
        ('People of Michigan v. Ryan Christopher Smith', ''),
        ('Kincade (Michael) v. State', ''),
        ('Tonubbee v. River Parishes Guide', 'Tonubbee'),
        ('United States v. Richiez', 'Richiez'),
        ('In Re Allamaras', 'In Re Allamaras'),
        ('United States v. Capoccia', 'Capoccia'),
        ('Com. v. DeFranco', 'Com.'),
        ('Matheny v. Porter', 'Matheny'),
        ('Piper v. Hoffman', 'Piper'),
        ('People v. Smith', ''),  # Punted b/c People and Smith are bad.
        ('Mobuary, Joseph v. State.', ''),  # Punted b/c "State." has punct
    ]
    tweaker = CaseNameTweaker()
    for t in test_pairs:
        output = tweaker.make_case_name_short(t[0])
        self.assertEqual(output, t[1],
                         "Input was:\n\t%s\n\n\tExpected: '%s'\n\tActual: '%s'"
                         % (t[0], t[1], output))
def test_make_short_name(self):
    """Verify CaseNameTweaker.make_case_name_short against known pairs.

    Each tuple is ``(full case name, expected short name)``. An empty
    expected string means the tweaker is expected to punt and return
    nothing rather than guess.
    """
    test_pairs = [
        # In re and Matter of
        ("In re Lissner", "In re Lissner"),
        ("Matter of Lissner", "Matter of Lissner"),
        # Plaintiff is in bad word list
        ("State v. Lissner", "Lissner"),
        ("People v. Lissner", "Lissner"),
        ("California v. Lissner", "Lissner"),
        ("Dallas v. Lissner", "Lissner"),
        # Basic 3-word case
        ("Langley v. Google", "Langley"),
        # Similar to above, but more than 3 words
        ("Langley v. Google foo", "Langley"),
        # United States v. ...
        ("United States v. Lissner", "Lissner"),
        # Corporate first name
        ("Google, Inc. v. Langley", "Langley"),
        ("Special, LLC v. Langley", "Langley"),
        ("Google Corp. v. Langley", "Langley"),
        # Shorter appellant than plaintiff
        ("Michael Lissner v. Langley", "Langley"),
        # Multi-v with and w/o a bad_word
        ("Alameda v. Victor v. Keyboard", ""),
        ("Bloggers v. Victor v. Keyboard", ""),
        # Long left, short right
        ("Many words here v. Langley", "Langley"),
        # Other manually added items
        ("Ilarion v. State", "Ilarion"),
        ("Imery v. Vangil Ingenieros", "Imery"),
        # Many more tests from real data!
        ("Bean v. City of Monahans", "Bean"),
        ("Blanke v. Time, Inc.", "Blanke"),
        ("New York Life Ins. Co. v. Deshotel", "Deshotel"),
        ("Deatherage v. Deatherage", "Deatherage"),
        ("Gonzalez Vargas v. Holder", ""),
        ("Campbell v. Wainwright", "Campbell"),
        ("Liggett & Myers Tobacco Co. v. Finzer", "Finzer"),
        ("United States v. Brenes", "Brenes"),
        ("A.H. Robins Co., Inc. v. Eli Lilly & Co", ""),
        ("McKellar v. Hazen", "McKellar"),
        ("Gil v. State", "Gil"),
        ("Fuentes v. Owen", "Fuentes"),
        ("State v. Shearer", "Shearer"),
        ("United States v. Smither", "Smither"),
        ("People v. Bradbury", "Bradbury"),
        ("Venable (James) v. State", ""),
        ("Burkhardt v. Bailey", "Burkhardt"),
        ("DeLorenzo v. Bales", "DeLorenzo"),
        ("Loucks v. Bauman", "Loucks"),
        ("Kenneth Stern v. Robert Weinstein", ""),
        ("Rayner v. Secretary of Health and Human Services", "Rayner"),
        ("Rhyne v. Martin", "Rhyne"),
        ("State v. Wolverton", "Wolverton"),
        ("State v. Flood", "Flood"),
        ("Amason v. Natural Gas Pipeline Co.", "Amason"),
        ("United States v. Bryant", "Bryant"),
        ("WELLS FARGO BANK v. APACHE TRIBE OF OKLAHOMA", ""),
        ("Stewart v. Tupperware Corp.", "Stewart"),
        ("Society of New York Hosp. v. ASSOCIATED HOSP. SERV. OF NY", ""),
        ("Stein v. State Tax Commission", "Stein"),
        (
            "The Putnam Pit, Inc. Geoffrey Davidian v. City of Cookeville, Tennessee Jim Shipley",
            "",
        ),
        ("People v. Armstrong", "Armstrong"),
        ("Weeks v. Weeks", "Weeks"),
        ("Smith v. Xerox Corp.", ""),
        ("In Interest of Ad", ""),
        ("People v. Forsyth", "Forsyth"),
        ("State v. LeClair", "LeClair"),
        ("Agristor Credit Corp. v. Unruh", "Unruh"),
        ("United States v. Larry L. Stewart", ""),
        ("Starling v. United States", "Starling"),
        ("United States v. Pablo Colin-Molina", ""),
        ("Kenneth N. Juhl v. The United States", ""),
        ("Matter of Wilson", "Matter of Wilson"),
        ("In Re Damon H.", ""),
        ("Centennial Ins. Co. v. Zylberberg", "Zylberberg"),
        ("United States v. Donald Lee Stotler", ""),
        ("Byndloss v. State", "Byndloss"),
        ("People v. Piatkowski", "Piatkowski"),
        ("United States v. Willie James Morgan", ""),
        ("Harbison (Debra) v. Thieret (James)", ""),
        ("Federal Land Bank of Columbia v. Lieben", "Lieben"),
        ("John Willard Greywind v. John T. Podrebarac", ""),
        ("State v. Powell", "Powell"),
        ("Carr v. Galloway", "Carr"),
        ("Saylors v. State", "Saylors"),
        ("Jones v. Franke", "Jones"),
        (
            "In Re Robert L. Mills, Debtor. Robert L. Mills v. Sdrawde "
            "Titleholders, Inc., a California Corporation",
            "",
        ),
        (
            "Pollenex Corporation v. Sunbeam-Home Comfort, a Division of "
            "Sunbeam Corp., Raymond Industrial, Limited and Raymond Marketing "
            "Corporation of North America",
            "",
        ),
        ("Longs v. State", "Longs"),
        ("Performance Network Solutions v. Cyberklix", "Cyberklix"),
        ("DiSabatino v. Salicete", "DiSabatino"),
        ("State v. Jennifer Nicole Jackson", ""),
        ("United States v. Moreno", "Moreno"),
        ("LOGAN & KANAWHA COAL v. Banque Francaise", ""),
        ("State v. Harrison", "Harrison"),
        ("Efford v. Milam", "Efford"),
        ("People v. Thompson", "Thompson"),
        ("CINCINNATI THERMAL SPRAY v. Pender County", ""),
        ("JAH Ex Rel. RMH v. Wadle & Associates", ""),
        ("United Pub. Employees v. CITY & CTY. OF SAN FRAN.", ""),
        ("Warren v. Massachusetts Indemnity", "Warren"),
        (
            'Marion Edwards v. State Farm Insurance Company and "John Doe,"',
            "",
        ),
        ("Snowdon v. Grillo", "Snowdon"),
        ("Adam Lunsford v. Cravens Funeral Home", ""),
        ("State v. Dillon", "Dillon"),
        ("In Re Graham", "In Re Graham"),
        ("Durham v. Chrysler Corp.", ""),  # Fails b/c Durham is a city!
        ("Carolyn Warrick v. Motiva Enterprises, L.L.C", ""),
        ("United States v. Aloi", "Aloi"),
        ("United States Fidelity & Guaranty v. Graham", "Graham"),
        ("Wildberger v. Rosenbaum", "Wildberger"),
        ("Truck Insurance Exchange v. Michling", "Michling"),
        ("Black Voters v. John J. McDonough", ""),
        ("State of Tennessee v. William F. Cain", ""),
        ("Robert J. Imbrogno v. Defense Logistics Agency", ""),
        ("Leetta Beachum, Administratrix v. Timothy Joseph White", ""),
        ("United States v. Jorge Gonzalez-Villegas", ""),
        ("Pitts v. Florida Bd. of Bar Examiners", "Pitts"),
        ("State v. Pastushin", "Pastushin"),
        ("Clark v. Clark", ""),
        ("Barrios v. Holder", "Barrios"),
        ("Gregory L. Lavin v. United States", ""),
        ("Carpenter v. Consumers Power", "Carpenter"),
        ("Derbabian v. S & C SNOWPLOWING, INC.", "Derbabian"),
        ("Bright v. LSI CORP.", "Bright"),
        ("State v. Brown", "Brown"),
        ("KENNEY v. Keebler Co.", "KENNEY"),
        ("Hill v. Chalanor", "Hill"),
        ("Washington v. New Jersey", ""),
        ("Sollek v. Laseter", "Sollek"),
        (
            "United States v. John Handy Jones, International Fidelity "
            "Insurance Company",
            "",
        ),
        ("N.L.R.B. v. I. W. Corp", ""),
        ("Karpisek v. Cather & Sons Construction, Inc.", "Karpisek"),
        ("Com. v. Wade", "Com."),
        ("Glascock v. Sukumlyn", "Glascock"),
        ("Burroughs v. Hills", "Burroughs"),
        ("State v. Darren Matthew Lee", ""),
        ("Mastondrea v. Occidental Hotels Management", "Mastondrea"),
        ("Kent v. C. I. R", "Kent"),
        ("Johnson v. City of Detroit", ""),
        ("Nolan v. United States", "Nolan"),
        ("Currence v. Denver Tramway Corporation", "Currence"),
        ("Matter of Cano", "Matter of Cano"),
        # Two words after "Matter of --> Punt."
        ("Matter of Alphabet Soup", ""),
        # Zero words after "Matter of" --> Punt.
        ("Matter of", "Matter of"),
        ("Simmons v. Stalder", "Simmons"),
        ("United States v. Donnell Hagood", ""),
        ("Kale v. United States INS", "Kale"),
        ("Cmk v. Department of Revenue Ex Rel. Kb", "Cmk"),
        ("State Farm Mut. Auto. Ins. Co. v. Barnes", "Barnes"),
        ("In Re Krp", "In Re Krp"),
        ("CH v. Department of Children and Families", "CH"),
        ("Com. v. Monosky", "Com."),
        ("JITNEY-JUNGLE, INCORPORATED v. City of Brookhaven", ""),
        ("Carolyn Humphrey v. Memorial Hospitals Association", ""),
        ("Wagner v. Sanders Associates, Inc.", "Wagner"),
        ("United States v. Venie (Arthur G.)", ""),
        ("Mitchell v. State", ""),
        ("City of Biloxi, Miss. v. Giuffrida", "Giuffrida"),
        ("Sexton v. St. Clair Federal Sav. Bank", "Sexton"),
        ("United States v. Matthews", "Matthews"),
        ("Freeman v. Freeman", "Freeman"),
        ("Spencer v. Toussaint", "Spencer"),
        ("In Re Canaday", "In Re Canaday"),
        ("Wenger v. Commission on Judicial Performance", "Wenger"),
        ("Jackson v. Janecka", "Janecka"),
        ("People of Michigan v. Ryan Christopher Smith", ""),
        ("Kincade (Michael) v. State", ""),
        ("Tonubbee v. River Parishes Guide", "Tonubbee"),
        ("United States v. Richiez", "Richiez"),
        ("In Re Allamaras", "In Re Allamaras"),
        ("United States v. Capoccia", "Capoccia"),
        ("Com. v. DeFranco", "Com."),
        ("Matheny v. Porter", "Matheny"),
        ("Piper v. Hoffman", "Piper"),
        ("People v. Smith", ""),  # Punted b/c People and Smith are bad.
        ("Mobuary, Joseph v. State.", ""),  # Punted b/c "State." has punct
    ]
    tweaker = CaseNameTweaker()
    for t in test_pairs:
        output = tweaker.make_case_name_short(t[0])
        self.assertEqual(
            output,
            t[1],
            "Input was:\n\t%s\n\n\tExpected: '%s'\n\tActual: '%s'"
            % (t[0], t[1], output),
        )
class Command(VerboseCommand):
    """Run the Juriscraper toolkit against one or many jurisdictions,
    downloading new opinions, de-duplicating them, and saving them to the
    database.
    """
    help = 'Runs the Juriscraper toolkit against one or many jurisdictions.'

    def __init__(self, stdout=None, stderr=None, no_color=False):
        # BUG FIX: the original ignored its arguments and always forwarded
        # literal defaults to super(); pass the caller's values through so
        # output streams and color settings are honored.
        super(Command, self).__init__(stdout=stdout, stderr=stderr,
                                      no_color=no_color)
        self.cnt = CaseNameTweaker()

    def add_arguments(self, parser):
        """Register the command-line options for this command."""
        parser.add_argument(
            '--daemon',
            action='store_true',
            default=False,
            help=('Use this flag to turn on daemon mode, in which all '
                  'courts requested will be scraped in turn, '
                  'nonstop, in a loop.'),
        )
        parser.add_argument(
            '--rate',
            type=int,
            default=30,
            help=('The length of time in minutes it takes to crawl '
                  'all requested courts. Particularly useful if it is '
                  'desired to quickly scrape over all courts. Default '
                  'is 30 minutes.'),
        )
        parser.add_argument(
            '--courts',
            type=str,
            dest='court_id',
            metavar="COURTID",
            required=True,
            help=('The court(s) to scrape and extract. This should be '
                  'in the form of a python module or package import '
                  'from the Juriscraper library, e.g. '
                  '"juriscraper.opinions.united_states.federal_appellate.ca1" '
                  'or simply "opinions" to do all opinions.'),
        )
        parser.add_argument(
            '--fullcrawl',
            dest='full_crawl',
            action='store_true',
            default=False,
            help="Disable duplicate aborting.",
        )

    def make_objects(self, item, court, sha1_hash, content):
        """Takes the meta data from the scraper and associates it with objects.

        Returns the created objects as a tuple of
        (docket, opinion, cluster, citations, error), where ``error`` is
        True when the binary content could not be saved to disk. None of
        the returned objects has been persisted yet except the ErrorLog
        written on failure.
        """
        blocked = item['blocked_statuses']
        if blocked:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))
        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            source=Docket.SCRAPER,
        )

        west_cite_str = item.get('west_citations', '')
        state_cite_str = item.get('west_state_citations', '')
        neutral_cite_str = item.get('neutral_citations', '')
        cluster = OpinionCluster(
            judges=item.get('judges', ''),
            date_filed=item['case_dates'],
            date_filed_is_approximate=item['date_filed_is_approximate'],
            case_name=item['case_names'],
            case_name_short=case_name_short,
            source='C',
            precedential_status=item['precedential_statuses'],
            nature_of_suit=item.get('nature_of_suit', ''),
            blocked=blocked,
            date_blocked=date_blocked,
            # These three fields are replaced below.
            federal_cite_one=west_cite_str,
            state_cite_one=state_cite_str,
            neutral_cite=neutral_cite_str,
            syllabus=item.get('summaries', ''),
        )
        citations = []
        cite_types = [
            (west_cite_str, Citation.WEST),
            (state_cite_str, Citation.STATE),
            (neutral_cite_str, Citation.NEUTRAL),
        ]
        for cite_str, cite_type in cite_types:
            if cite_str:
                citations.append(make_citation(cite_str, cluster, cite_type))
        opinion = Opinion(
            type='010combined',
            sha1=sha1_hash,
            download_url=item['download_urls'],
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            file_name = trunc(item['case_names'].lower(), 75) + extension
            opinion.file_with_date = cluster.date_filed
            opinion.local_path.save(file_name, cf, save=False)
        except Exception:
            # BUG FIX: was a bare ``except:`` which also swallowed
            # SystemExit/KeyboardInterrupt. Still deliberately broad: any
            # failure here is logged and reported via the error flag.
            msg = ('Unable to save binary to disk. Deleted '
                   'item: %s.\n %s' %
                   (item['case_names'], traceback.format_exc()))
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, opinion, cluster, citations, error

    def save_everything(self, items, index=False, backscrape=False):
        """Saves all the sub items and associates them as appropriate.
        """
        docket, cluster = items['docket'], items['cluster']
        opinion, citations = items['opinion'], items['citations']
        docket.save()
        cluster.docket = docket
        cluster.save(index=False)  # Index only when the opinion is associated.

        for citation in citations:
            citation.cluster_id = cluster.pk
            citation.save()

        if cluster.judges:
            candidate_judges = get_candidate_judges(
                cluster.judges,
                docket.court.pk,
                cluster.date_filed,
            )
            if len(candidate_judges) == 1:
                opinion.author = candidate_judges[0]

            if len(candidate_judges) > 1:
                for candidate in candidate_judges:
                    cluster.panel.add(candidate)

        opinion.cluster = cluster
        opinion.save(index=index)
        if not backscrape:
            RealTimeQueue.objects.create(item_type='o', item_pk=opinion.pk)

    def scrape_court(self, site, full_crawl=False):
        """Crawl one parsed Juriscraper site, saving any new opinions.

        Aborts early when the site's URL hash is unchanged (unless
        ``full_crawl``), and stops updating the site hash if any download
        failed so the items are retried on the next run.
        """
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i, item in enumerate(site):
                msg, r = get_binary_content(
                    item['download_urls'],
                    site.cookies,
                    site._get_adapter_instance(),
                    method=site.method
                )
                if msg:
                    # BUG FIX: logger.warn is a deprecated alias of
                    # logger.warning.
                    logger.warning(msg)
                    ErrorLog(log_level='WARNING', court=court,
                             message=msg).save()
                    continue

                content = site.cleanup_content(r.content)

                current_date = item['case_dates']
                try:
                    next_date = site[i + 1]['case_dates']
                except IndexError:
                    next_date = None

                # request.content is sometimes a str, sometimes unicode, so
                # force it all to be bytes, pleasing hashlib.
                sha1_hash = hashlib.sha1(force_bytes(content)).hexdigest()
                if (court_str == 'nev' and
                        item['precedential_statuses'] == 'Unpublished'):
                    # Nevada's non-precedential cases have different SHA1
                    # sums every time.
                    lookup_params = {'lookup_value': item['download_urls'],
                                     'lookup_by': 'download_url'}
                else:
                    lookup_params = {'lookup_value': sha1_hash,
                                     'lookup_by': 'sha1'}

                onwards = dup_checker.press_on(Opinion, current_date,
                                               next_date, **lookup_params)
                if dup_checker.emulate_break:
                    break

                if onwards:
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' %
                                item['download_urls'].encode('utf-8'))
                    dup_checker.reset()

                    docket, opinion, cluster, citations, error = \
                        self.make_objects(item, court, sha1_hash, content)

                    if error:
                        download_error = True
                        continue

                    self.save_everything(
                        items={
                            'docket': docket,
                            'opinion': opinion,
                            'cluster': cluster,
                            'citations': citations,
                        },
                        index=False,
                    )
                    extract_doc_content.delay(
                        opinion.pk,
                        do_ocr=True,
                        citation_jitter=True,
                    )

                    logger.info("Successfully added doc {pk}: {name}".format(
                        pk=opinion.pk,
                        name=item['case_names'].encode('utf-8'),
                    ))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled opinions." % site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)

    def parse_and_scrape_site(self, mod, full_crawl):
        """Parse the given Juriscraper module's site and scrape it."""
        site = mod.Site().parse()
        self.scrape_court(site, full_crawl)

    def handle(self, *args, **options):
        """Scrape every requested court once, or forever in daemon mode,
        pacing the crawl so all courts fit in ``--rate`` minutes.
        """
        super(Command, self).handle(*args, **options)
        global die_now

        # this line is used for handling SIGTERM (CTRL+4), so things can die
        # safely
        signal.signal(signal.SIGTERM, signal_handler)

        module_strings = build_module_list(options['court_id'])
        if not len(module_strings):
            raise CommandError('Unable to import module or package. '
                               'Aborting.')

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        wait = (options['rate'] * 60) / num_courts
        i = 0
        while i < num_courts:
            # this catches SIGTERM, so the code can be killed safely.
            if die_now:
                logger.info("The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)

            mod = __import__(
                "%s.%s" % (package, module), globals(), locals(), [module]
            )
            # noinspection PyBroadException
            try:
                self.parse_and_scrape_site(mod, options['full_crawl'])
            except Exception:
                # noinspection PyBroadException
                try:
                    msg = ('********!! CRAWLER DOWN !!***********\n'
                           '*****scrape_court method failed!*****\n'
                           '********!! ACTION NEEDED !!**********\n%s' %
                           traceback.format_exc())
                    logger.critical(msg)

                    # opinions.united_states.federal.ca9_u --> ca9
                    court_str = mod.Site.__module__.split('.')[-1].split('_')[0]
                    court = Court.objects.get(pk=court_str)
                    ErrorLog(
                        log_level='CRITICAL', court=court, message=msg
                    ).save()
                except Exception:
                    # This is very important. Without this, an exception
                    # above will crash the caller.
                    pass
            finally:
                time.sleep(wait)
                last_court_in_list = (i == (num_courts - 1))
                if last_court_in_list and options['daemon']:
                    # Start over...
                    logger.info("All jurisdictions done. Looping back to "
                                "the beginning because daemon mode is enabled.")
                    i = 0
                else:
                    i += 1

        logger.info("The scraper has stopped.")
        sys.exit(0)