def __init__(self, cnt=None):
    super(AbstractSite, self).__init__()

    # Computed metadata
    self.hash = None
    self.html = None
    self.method = "GET"
    self.back_scrape_iterable = None
    self.downloader_executed = False
    self.cookies = {}
    self.cnt = cnt or CaseNameTweaker()
    self.request = {
        "verify": certifi.where(),
        "session": requests.session(),
        "headers": {"User-Agent": "Juriscraper"},
        # Disable CDN caching on sites like SCOTUS (ahem)
        "cache-control": "no-cache, no-store, max-age=1",
        "parameters": {},
        "request": None,
        "status": None,
        "url": None,
    }

    # Sub-classed metadata
    self.court_id = None
    self.url = None
    self.parameters = None
    self.uses_selenium = None
    self._opt_attrs = []
    self._req_attrs = []
    self._all_attrs = []
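# --- Usage sketch (not from the snippets above) ---
# The Site constructor above accepts an optional CaseNameTweaker so a caller
# can build one instance and share it across many Site objects, as the
# example-file test further down does. A minimal sketch, assuming the usual
# juriscraper.lib.string_utils import path; `site_classes` is a hypothetical
# list of juriscraper Site classes.
from juriscraper.lib.string_utils import CaseNameTweaker


def build_sites(site_classes):
    shared_cnt = CaseNameTweaker()  # construct once, reuse everywhere
    return [site_class(cnt=shared_cnt) for site_class in site_classes]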
def get_pdfs(options):
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each of
    which represents a PDF we need to download and merge into our normal
    tables: Docket, DocketEntry, and RECAPDocument.

    In this function, we iterate over the entire table of results, merge it
    into our normal tables, and then download and extract the PDF.

    :return: None
    """
    q = options["queue"]
    index = options["index"]
    cnt = CaseNameTweaker()
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only("pk")
    count = rows.count()
    task_name = "downloading"
    if index:
        task_name += " and indexing"
    logger.info("%s %s items from PACER." % (task_name, count))
    throttle = CeleryThrottle(queue_name=q)
    completed = 0
    for row in queryset_generator(rows):
        throttle.maybe_wait()
        if completed % 30000 == 0:
            pacer_session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            pacer_session.login()
        c = chain(
            process_free_opinion_result.si(row.pk, cnt).set(queue=q),
            get_and_process_pdf.s(pacer_session.cookies, row.pk).set(queue=q),
            delete_pacer_row.s(row.pk).set(queue=q),
        )
        if index:
            c |= add_items_to_solr.s("search.RECAPDocument").set(queue=q)
        c.apply_async()
        completed += 1
        if completed % 1000 == 0:
            logger.info(
                "Sent %s/%s tasks to celery for %s so far."
                % (completed, count, task_name)
            )
def get_pdfs(options: OptionsType) -> None:
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each of
    which represents a PDF we need to download and merge into our normal
    tables: Docket, DocketEntry, and RECAPDocument.

    In this function, we iterate over the entire table of results, merge it
    into our normal tables, and then download and extract the PDF.

    :return: None
    """
    q = cast(str, options["queue"])
    index = options["index"]
    cnt = CaseNameTweaker()
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only("pk")
    count = rows.count()
    task_name = "downloading"
    if index:
        task_name += " and indexing"
    logger.info(f"{task_name} {count} items from PACER.")
    throttle = CeleryThrottle(queue_name=q)
    completed = 0
    for row in rows.iterator():
        throttle.maybe_wait()
        c = chain(
            process_free_opinion_result.si(
                row.pk,
                row.court_id,
                cnt,
            ).set(queue=q),
            get_and_process_free_pdf.s(row.pk, row.court_id).set(queue=q),
            delete_pacer_row.s(row.pk).set(queue=q),
        )
        if index:
            c |= add_items_to_solr.s("search.RECAPDocument").set(queue=q)
        c.apply_async()
        completed += 1
        if completed % 1000 == 0:
            logger.info(
                f"Sent {completed}/{count} tasks to celery for {task_name} so far."
            )
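# --- Usage sketch (not from the snippets above) ---
# Both get_pdfs() versions queue a Celery chain per row: process the result,
# fetch the PDF, then delete the row. A minimal standalone sketch of that
# pattern with hypothetical task names and an assumed local Redis broker;
# .si() ignores the previous task's result, while .s() receives it.
from celery import Celery, chain

app = Celery("pdf_pipeline", broker="redis://localhost:6379/0")


@app.task
def process_row(row_pk):
    # Merge the row into the normal tables (placeholder).
    return row_pk


@app.task
def fetch_pdf(row_pk):
    # Download and extract the PDF for the row (placeholder).
    return row_pk


@app.task
def delete_row(row_pk):
    # Remove the processed row (placeholder).
    return None


def enqueue_row(row_pk, queue="pacer"):
    c = chain(
        process_row.si(row_pk).set(queue=queue),
        fetch_pdf.s().set(queue=queue),
        delete_row.s().set(queue=queue),
    )
    return c.apply_async()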
def __init__(self, cnt=None):
    super(AbstractSite, self).__init__()

    # Computed metadata
    self.hash = None
    self.html = None
    self.method = 'GET'
    self.use_sessions = False
    self.status = None
    self.back_scrape_iterable = None
    self.cookies = {}
    self.cnt = cnt or CaseNameTweaker()

    # Sub-classed metadata
    self.court_id = None
    self.url = None
    self.parameters = None
    self.uses_selenium = None
    self._opt_attrs = []
    self._req_attrs = []
    self._all_attrs = []
def __init__(self, stdout=None, stderr=None, no_color=False):
    super(Command, self).__init__(stdout=None, stderr=None, no_color=False)
    self.cnt = CaseNameTweaker()
class Command(VerboseCommand): help = 'Runs the Juriscraper toolkit against one or many jurisdictions.' def __init__(self, stdout=None, stderr=None, no_color=False): super(Command, self).__init__(stdout=None, stderr=None, no_color=False) self.cnt = CaseNameTweaker() def add_arguments(self, parser): parser.add_argument( '--daemon', action='store_true', default=False, help=('Use this flag to turn on daemon mode, in which all ' 'courts requested will be scraped in turn, ' 'nonstop, in a loop.'), ) parser.add_argument( '--rate', type=int, default=30, help=('The length of time in minutes it takes to crawl ' 'all requested courts. Particularly useful if it is ' 'desired to quickly scrape over all courts. Default ' 'is 30 minutes.'), ) parser.add_argument( '--courts', type=str, dest='court_id', metavar="COURTID", required=True, help=('The court(s) to scrape and extract. This should be ' 'in the form of a python module or package import ' 'from the Juriscraper library, e.g. ' '"juriscraper.opinions.united_states.federal_appellate.ca1" ' 'or simply "opinions" to do all opinions.'), ) parser.add_argument( '--fullcrawl', dest='full_crawl', action='store_true', default=False, help="Disable duplicate aborting.", ) def make_objects(self, item, court, sha1_hash, content): """Takes the meta data from the scraper and associates it with objects. Returns the created objects. """ blocked = item['blocked_statuses'] if blocked: date_blocked = date.today() else: date_blocked = None case_name_short = (item.get('case_name_shorts') or self.cnt.make_case_name_short(item['case_names'])) docket = Docket( docket_number=item.get('docket_numbers', ''), case_name=item['case_names'], case_name_short=case_name_short, court=court, blocked=blocked, date_blocked=date_blocked, source=Docket.SCRAPER, ) west_cite_str = item.get('west_citations', '') state_cite_str = item.get('west_state_citations', '') neutral_cite_str = item.get('neutral_citations', '') cluster = OpinionCluster( judges=item.get('judges', ''), date_filed=item['case_dates'], date_filed_is_approximate=item['date_filed_is_approximate'], case_name=item['case_names'], case_name_short=case_name_short, source='C', precedential_status=item['precedential_statuses'], nature_of_suit=item.get('nature_of_suit', ''), blocked=blocked, date_blocked=date_blocked, # These three fields are replaced below. federal_cite_one=west_cite_str, state_cite_one=state_cite_str, neutral_cite=neutral_cite_str, syllabus=item.get('summaries', ''), ) citations = [] cite_types = [ (west_cite_str, Citation.WEST), (state_cite_str, Citation.STATE), (neutral_cite_str, Citation.NEUTRAL), ] for cite_str, cite_type in cite_types: if cite_str: citations.append(make_citation(cite_str, cluster, cite_type)) opinion = Opinion( type='010combined', sha1=sha1_hash, download_url=item['download_urls'], ) error = False try: cf = ContentFile(content) extension = get_extension(content) file_name = trunc(item['case_names'].lower(), 75) + extension opinion.file_with_date = cluster.date_filed opinion.local_path.save(file_name, cf, save=False) except: msg = ('Unable to save binary to disk. Deleted ' 'item: %s.\n %s' % (item['case_names'], traceback.format_exc())) logger.critical(msg.encode('utf-8')) ErrorLog(log_level='CRITICAL', court=court, message=msg).save() error = True return docket, opinion, cluster, citations, error def save_everything(self, items, index=False, backscrape=False): """Saves all the sub items and associates them as appropriate. 
""" docket, cluster = items['docket'], items['cluster'] opinion, citations = items['opinion'], items['citations'] docket.save() cluster.docket = docket cluster.save(index=False) # Index only when the opinion is associated. for citation in citations: citation.cluster_id = cluster.pk citation.save() if cluster.judges: candidate_judges = get_candidate_judges( cluster.judges, docket.court.pk, cluster.date_filed, ) if len(candidate_judges) == 1: opinion.author = candidate_judges[0] if len(candidate_judges) > 1: for candidate in candidate_judges: cluster.panel.add(candidate) opinion.cluster = cluster opinion.save(index=index) if not backscrape: RealTimeQueue.objects.create(item_type='o', item_pk=opinion.pk) def scrape_court(self, site, full_crawl=False): download_error = False # Get the court object early for logging # opinions.united_states.federal.ca9_u --> ca9 court_str = site.court_id.split('.')[-1].split('_')[0] court = Court.objects.get(pk=court_str) dup_checker = DupChecker(court, full_crawl=full_crawl) abort = dup_checker.abort_by_url_hash(site.url, site.hash) if not abort: if site.cookies: logger.info("Using cookies: %s" % site.cookies) for i, item in enumerate(site): msg, r = get_binary_content(item['download_urls'], site.cookies, site._get_adapter_instance(), method=site.method) if msg: logger.warn(msg) ErrorLog(log_level='WARNING', court=court, message=msg).save() continue content = site.cleanup_content(r.content) current_date = item['case_dates'] try: next_date = site[i + 1]['case_dates'] except IndexError: next_date = None # request.content is sometimes a str, sometimes unicode, so # force it all to be bytes, pleasing hashlib. sha1_hash = sha1(force_bytes(content)) if (court_str == 'nev' and item['precedential_statuses'] == 'Unpublished'): # Nevada's non-precedential cases have different SHA1 # sums every time. lookup_params = { 'lookup_value': item['download_urls'], 'lookup_by': 'download_url' } else: lookup_params = { 'lookup_value': sha1_hash, 'lookup_by': 'sha1' } onwards = dup_checker.press_on(Opinion, current_date, next_date, **lookup_params) if dup_checker.emulate_break: break if onwards: # Not a duplicate, carry on logger.info('Adding new document found at: %s' % item['download_urls'].encode('utf-8')) dup_checker.reset() docket, opinion, cluster, citations, error = self.make_objects( item, court, sha1_hash, content) if error: download_error = True continue self.save_everything(items={ 'docket': docket, 'opinion': opinion, 'cluster': cluster, 'citations': citations, }, index=False) extract_doc_content.delay( opinion.pk, do_ocr=True, citation_jitter=True, ) logger.info("Successfully added doc {pk}: {name}".format( pk=opinion.pk, name=item['case_names'].encode('utf-8'), )) # Update the hash if everything finishes properly. logger.info("%s: Successfully crawled opinions." % site.court_id) if not download_error and not full_crawl: # Only update the hash if no errors occurred. dup_checker.update_site_hash(site.hash) def parse_and_scrape_site(self, mod, full_crawl): site = mod.Site().parse() self.scrape_court(site, full_crawl) def handle(self, *args, **options): super(Command, self).handle(*args, **options) global die_now # this line is used for handling SIGTERM (CTRL+4), so things can die # safely signal.signal(signal.SIGTERM, signal_handler) module_strings = build_module_list(options['court_id']) if not len(module_strings): raise CommandError('Unable to import module or package. 
Aborting.') logger.info("Starting up the scraper.") num_courts = len(module_strings) wait = (options['rate'] * 60) / num_courts i = 0 while i < num_courts: # this catches SIGTERM, so the code can be killed safely. if die_now: logger.info("The scraper has stopped.") sys.exit(1) package, module = module_strings[i].rsplit('.', 1) mod = __import__("%s.%s" % (package, module), globals(), locals(), [module]) # noinspection PyBroadException try: self.parse_and_scrape_site(mod, options['full_crawl']) except Exception as e: # noinspection PyBroadException try: msg = ('********!! CRAWLER DOWN !!***********\n' '*****scrape_court method failed!*****\n' '********!! ACTION NEEDED !!**********\n%s' % traceback.format_exc()) logger.critical(msg) # opinions.united_states.federal.ca9_u --> ca9 court_str = mod.Site.__module__.split('.')[-1].split( '_')[0] court = Court.objects.get(pk=court_str) ErrorLog(log_level='CRITICAL', court=court, message=msg).save() except Exception as e: # This is very important. Without this, an exception # above will crash the caller. pass finally: time.sleep(wait) last_court_in_list = (i == (num_courts - 1)) if last_court_in_list and options['daemon']: # Start over... logger.info( "All jurisdictions done. Looping back to " "the beginning because daemon mode is enabled.") i = 0 else: i += 1 logger.info("The scraper has stopped.") sys.exit(0)
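# --- Usage sketch (not from the snippets above) ---
# The duplicate check in scrape_court() above keys its lookup on a SHA1 of the
# downloaded content after forcing it to bytes. A standalone sketch of that
# normalization using only the standard library; the project's own
# sha1()/force_bytes() helpers do the equivalent.
import hashlib


def content_hash(content):
    # Responses are sometimes str and sometimes bytes, so normalize first.
    if isinstance(content, str):
        content = content.encode("utf-8")
    return hashlib.sha1(content).hexdigest()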
class PacerXMLParser(object): """A class to parse a PACER XML file""" cnt = CaseNameTweaker() def __init__(self, path): logger.info("Initializing parser for %s" % path) # High-level attributes self.path = path self.xml = self.get_xml_contents() self.case_details = self.get_case_details() self.document_list = self.get_document_list() self.party_list = self.get_party_list() self.document_count = self.get_document_count() # Docket attributes self.court = self.get_court() self.docket_number = self.get_str_from_node(self.case_details, 'docket_num') self.pacer_case_id = self.get_str_from_node(self.case_details, 'pacer_case_num') self.date_filed = self.get_datetime_from_node(self.case_details, 'date_case_filed', cast_to_date=True) self.date_terminated = self.get_datetime_from_node( self.case_details, 'date_case_terminated', cast_to_date=True) self.date_last_filing = self.get_datetime_from_node(self.case_details, 'date_last_filing', cast_to_date=True) self.case_name = harmonize( self.get_str_from_node(self.case_details, 'case_name')) self.case_name_short = self.cnt.make_case_name_short(self.case_name) self.cause = self.get_str_from_node(self.case_details, 'case_cause') self.nature_of_suit = self.get_str_from_node(self.case_details, 'nature_of_suit') self.jury_demand = self.get_str_from_node(self.case_details, 'jury_demand') self.jurisdiction_type = self.get_str_from_node( self.case_details, 'jurisdiction') self.assigned_to, self.assigned_to_str = self.get_judges('assigned_to') self.referred_to, self.referred_to_str = self.get_judges('referred_to') self.blocked, self.date_blocked = get_blocked_status( self, self.document_count) # Non-parsed fields self.filepath_local = os.path.join('recap', self.path) self.filepath_ia = get_docketxml_url_from_path(self.path) def get_xml_contents(self): """Extract the XML from the file on disk and return it as an lxml tree """ xml_parser = etree.XMLParser(recover=True) tree = etree.parse(self.path, xml_parser) return tree def get_case_details(self): """Most of the details are in the case_details node, so set it aside for faster parsing. """ return self.xml.xpath('//case_details')[0] def get_document_list(self): """Get the XML nodes for the documents""" return self.xml.xpath('//document_list/document') def get_party_list(self): """Get the XML nodes for the parties""" return self.xml.xpath('//party_list/party') def get_document_count(self): """Get the number of documents associated with this docket.""" return len(self.document_list) def make_documents(self, docket, debug): """Parse through the document nodes, making good objects. For every node, create a line item on the Docket (a DocketEntry), and create 1..n additional RECAPDocuments (attachments or regular documents) that are associated with that DocketEntry. Returns None if an error occurs. 
""" recap_docs = [] for doc_node in self.document_list: # Make a DocketEntry object entry_number = doc_node.xpath('@doc_num')[0] attachment_number = int(doc_node.xpath('@attachment_num')[0]) logger.info("Working on document %s, attachment %s" % (entry_number, attachment_number)) if attachment_number == 0: document_type = RECAPDocument.PACER_DOCUMENT else: document_type = RECAPDocument.ATTACHMENT try: docket_entry = DocketEntry.objects.get( docket=docket, entry_number=entry_number, ) except DocketEntry.DoesNotExist: if document_type == RECAPDocument.PACER_DOCUMENT: docket_entry = DocketEntry( docket=docket, entry_number=entry_number, ) else: logger.error("Tried to create attachment without a " "DocketEntry object to associate it with.") continue if document_type == RECAPDocument.PACER_DOCUMENT: date_filed = (self.get_datetime_from_node( doc_node, 'date_filed', cast_to_date=True) or docket_entry.date_filed) docket_entry.date_filed = date_filed docket_entry.description = (self.get_str_from_node( doc_node, 'long_desc') or docket_entry.description) try: if not debug: docket_entry.save() except (IntegrityError, DocketEntry.MultipleObjectsReturned): logger.error("Unable to create docket entry for docket " "#%s, on entry: %s." % (docket, entry_number)) continue recap_doc = self.make_recap_document( doc_node, docket_entry, entry_number, attachment_number, document_type, debug, ) if recap_doc is not None: recap_docs.append(recap_doc) return [item.pk for item in recap_docs] def make_recap_document(self, doc_node, docket_entry, entry_number, attachment_number, document_type, debug): """Make a PACER document.""" pacer_document_id = self.get_str_from_node(doc_node, 'pacer_doc_id') try: rd = RECAPDocument.objects.get( docket_entry=docket_entry, document_number=entry_number, # Use the attachment number if it is not 0, else use None. attachment_number=attachment_number or None, ) except RECAPDocument.DoesNotExist: rd = RECAPDocument( docket_entry=docket_entry, pacer_doc_id=pacer_document_id, document_number=entry_number, ) else: rd.pacer_doc_id = pacer_document_id or rd.pacer_doc_id rd.date_upload = self.get_datetime_from_node(doc_node, 'upload_date') rd.document_type = document_type or rd.document_type # If we can't parse the availability node (it returns None), default it # to False. availability = self.get_bool_from_node(doc_node, 'available') rd.is_available = False if availability is None else availability rd.sha1 = self.get_str_from_node(doc_node, 'sha1') rd.description = (self.get_str_from_node(doc_node, 'short_desc') or rd.description) if rd.is_available: rd.filepath_ia = get_ia_document_url_from_path( self.path, entry_number, attachment_number) rd.filepath_local = os.path.join( 'recap', get_local_document_url_from_path(self.path, entry_number, attachment_number), ) if rd.page_count is None: extension = rd.filepath_local.path.split('.')[-1] rd.page_count = get_page_count(rd.filepath_local.path, extension) if document_type == RECAPDocument.ATTACHMENT: rd.attachment_number = attachment_number if not debug: try: rd.save(do_extraction=False, index=False) except IntegrityError as e: # This happens when a pacer_doc_id has been wrongly set as # the document_number, see for example, document 19 and # document 00405193374 here: https://ia802300.us.archive.org/23/items/gov.uscourts.ca4.14-1872/gov.uscourts.ca4.14-1872.docket.xml logger.error( "Unable to create RECAPDocument for document #%s, " "attachment #%s on entry: %s due to " "IntegrityError." 
% (rd.document_number, rd.attachment_number, rd.docket_entry)) return None return rd @transaction.atomic def make_parties(self, docket, debug): """Pull out the parties and their attorneys and save them to the DB.""" atty_obj_cache = {} for party_node in self.party_list: party_name = self.get_str_from_node(party_node, 'name') party_type = self.get_str_from_node(party_node, 'type') party_type = normalize_party_types(party_type) party_extra_info = self.get_str_from_node(party_node, 'extra_info') logger.info("Working on party '%s' of type '%s'" % (party_name, party_type)) try: party = Party.objects.get(name=party_name) except Party.DoesNotExist: party = Party( name=party_name, extra_info=party_extra_info, ) if not debug: party.save() else: if party_extra_info and not debug: party.extra_info = party_extra_info party.save() # If the party type doesn't exist, make a new one. if not party.party_types.filter(docket=docket, name=party_type).exists(): pt = PartyType( docket=docket, party=party, name=party_type, ) if not debug: pt.save() self.add_attorneys(docket, party_node, party, atty_obj_cache, debug) def add_attorneys(self, docket, party_node, party, atty_obj_cache, debug): # Get the most recent date on the docket. We'll use this to have the # most updated attorney info. newest_docket_date = max([ d for d in [ docket.date_filed, docket.date_terminated, docket.date_last_filing ] if d ], ) atty_nodes = party_node.xpath('.//attorney_list/attorney') logger.info("Adding %s attorneys to the party." % len(atty_nodes)) for atty_node in atty_nodes: atty_name = self.get_str_from_node(atty_node, 'attorney_name') logger.info("Adding attorney: '%s'" % atty_name) atty_contact_raw = self.get_str_from_node(atty_node, 'contact') if 'see above' in atty_contact_raw.lower(): logger.info("Got 'see above' entry for atty_contact_raw.") atty_contact_raw = '' try: atty, atty_org_info, atty_info = atty_obj_cache[atty_name] except KeyError: logger.warn("Unable to lookup 'see above' entry. " "Creating/using atty with no contact info.") try: atty = Attorney.objects.get( name=atty_name, contact_raw=atty_contact_raw) except Attorney.DoesNotExist: atty = Attorney(name=atty_name, date_sourced=newest_docket_date, contact_raw=atty_contact_raw) if not debug: atty.save() else: # New attorney for this docket. Look them up in DB or create new # attorney if necessary. atty_org_info, atty_info = normalize_attorney_contact( atty_contact_raw, fallback_name=atty_name) try: logger.info("Didn't find attorney in cache, attempting " "lookup in the DB.") # Find an atty with the same name and one of another several # IDs. Important to add contact_raw here, b/c if it cannot # be parsed, all other values are blank. q = Q() fields = [ ('phone', atty_info['phone']), ('fax', atty_info['fax']), ('email', atty_info['email']), ('contact_raw', atty_contact_raw), ('organizations__lookup_key', atty_org_info.get('lookup_key')), ] for field, lookup in fields: if lookup: q |= Q(**{field: lookup}) atty = Attorney.objects.get(Q(name=atty_name) & q) except Attorney.DoesNotExist: logger.info("Unable to find matching attorney. Creating a " "new one: %s" % atty_name) atty = Attorney(name=atty_name, date_sourced=newest_docket_date, contact_raw=atty_contact_raw) if not debug: atty.save() except Attorney.MultipleObjectsReturned: logger.warn("Got too many results for attorney: '%s' " "Punting." % atty_name) continue # Cache the atty object and info for "See above" entries. 
atty_obj_cache[atty_name] = (atty, atty_org_info, atty_info) if atty_contact_raw: if atty_org_info: logger.info("Adding organization information to " "'%s': %s" % (atty_name, atty_org_info)) try: org = AttorneyOrganization.objects.get( lookup_key=atty_org_info['lookup_key'], ) except AttorneyOrganization.DoesNotExist: org = AttorneyOrganization(**atty_org_info) if not debug: org.save() # Add the attorney to the organization if not debug: AttorneyOrganizationAssociation.objects.get_or_create( attorney=atty, attorney_organization=org, docket=docket, ) atty_info_is_newer = (atty.date_sourced <= newest_docket_date) if atty_info and atty_info_is_newer: logger.info("Updating atty info because %s is more recent " "than %s." % (newest_docket_date, atty.date_sourced)) atty.date_sourced = newest_docket_date atty.contact_raw = atty_contact_raw atty.email = atty_info['email'] atty.phone = atty_info['phone'] atty.fax = atty_info['fax'] if not debug: atty.save() atty_role_str = self.get_str_from_node(atty_node, 'attorney_role') atty_roles = [ normalize_attorney_role(r) for r in atty_role_str.split('\n') if r ] atty_roles = [r for r in atty_roles if r['role'] is not None] atty_roles = remove_duplicate_dicts(atty_roles) if len(atty_roles) > 0: logger.info( "Linking attorney '%s' to party '%s' via %s " "roles: %s" % (atty_name, party.name, len(atty_roles), atty_roles)) else: logger.info("No role data parsed. Linking via 'UNKNOWN' role.") atty_roles = [{'role': Role.UNKNOWN, 'date_action': None}] if not debug: # Delete the old roles, replace with new. Role.objects.filter(attorney=atty, party=party, docket=docket).delete() Role.objects.bulk_create([ Role(attorney=atty, party=party, docket=docket, **atty_role) for atty_role in atty_roles ]) def get_court(self): """Extract the court from the XML and return it as a Court object""" court_str = self.case_details.xpath('court/text()')[0].strip() try: c = Court.objects.get(pk=map_pacer_to_cl_id(court_str)) except Court.DoesNotExist: raise ParsingException("Unable to identify court: %s" % court_str) else: return c @staticmethod def get_bool_from_node(node, path): try: s = node.xpath('%s/text()' % path)[0].strip() n = int(s) except IndexError: logger.debug("Couldn't get bool from path: %s" % path) return None except ValueError: logger.debug( "Couldn't convert text '%s' to int when making boolean " "for path: %s" % (s, path)) return None else: return bool(n) @staticmethod def get_str_from_node(node, path): try: s = node.xpath('%s/text()' % path)[0].strip() except IndexError: logger.debug("Couldn't get string from path: %s" % path) return '' # Return an empty string. Don't return None. else: return s def get_int_from_details(self, node): s = self.case_details.xpath('%s/text()' % node)[0].strip() try: return int(s) except ValueError: # Can't parse string to int logger.debug("Couldn't get int for node %s" % node) raise ParsingException("Cannot extract int for node %s" % node) @staticmethod def get_datetime_from_node(node, path, cast_to_date=False): """Parse a datetime from the XML located at node. If cast_to_date is true, the datetime object will be converted to a date. Else, will return a datetime object in parsed TZ if possible. Failing that, it will assume UTC. 
""" try: s = node.xpath('%s/text()' % path)[0].strip() except IndexError: logger.debug("Couldn't get date from path: %s" % path) return None else: try: d = parser.parse(s) except ValueError: logger.debug("Couldn't parse date: %s" % s) return None else: d = d.replace(tzinfo=d.tzinfo or gettz('UTC')) # Set it to UTC. if cast_to_date is True: return d.date() return d def get_judges(self, node): """Parse out the judge string and then look it up in the DB""" try: s = self.case_details.xpath('%s/text()' % node)[0].strip() except IndexError: logger.info("Couldn't get judge for node: %s" % node) return None, '' else: judges = get_candidate_judges(s, self.court.pk, self.date_filed) if len(judges) == 0: return None, s elif len(judges) == 1: return judges[0], s else: return None, s
def test_make_short_name(self): test_pairs = [ # In re and Matter of ("In re Lissner", "In re Lissner"), ("Matter of Lissner", "Matter of Lissner"), # Plaintiff is in bad word list ("State v. Lissner", "Lissner"), ("People v. Lissner", "Lissner"), ("California v. Lissner", "Lissner"), ("Dallas v. Lissner", "Lissner"), # Basic 3-word case ("Langley v. Google", "Langley"), # Similar to above, but more than 3 words ("Langley v. Google foo", "Langley"), # United States v. ... ("United States v. Lissner", "Lissner"), # Corporate first name ("Google, Inc. v. Langley", "Langley"), ("Special, LLC v. Langley", "Langley"), ("Google Corp. v. Langley", "Langley"), # Shorter appellant than plaintiff ("Michael Lissner v. Langley", "Langley"), # Multi-v with and w/o a bad_word ("Alameda v. Victor v. Keyboard", ""), ("Bloggers v. Victor v. Keyboard", ""), # Long left, short right ("Many words here v. Langley", "Langley"), # Other manually added items ("Ilarion v. State", "Ilarion"), ("Imery v. Vangil Ingenieros", "Imery"), # Many more tests from real data! ("Bean v. City of Monahans", "Bean"), ("Blanke v. Time, Inc.", "Blanke"), ("New York Life Ins. Co. v. Deshotel", "Deshotel"), ("Deatherage v. Deatherage", "Deatherage"), ("Gonzalez Vargas v. Holder", ""), ("Campbell v. Wainwright", "Campbell"), ("Liggett & Myers Tobacco Co. v. Finzer", "Finzer"), ("United States v. Brenes", "Brenes"), ("A.H. Robins Co., Inc. v. Eli Lilly & Co", ""), ("McKellar v. Hazen", "McKellar"), ("Gil v. State", "Gil"), ("Fuentes v. Owen", "Fuentes"), ("State v. Shearer", "Shearer"), ("United States v. Smither", "Smither"), ("People v. Bradbury", "Bradbury"), ("Venable (James) v. State", ""), ("Burkhardt v. Bailey", "Burkhardt"), ("DeLorenzo v. Bales", "DeLorenzo"), ("Loucks v. Bauman", "Loucks"), ("Kenneth Stern v. Robert Weinstein", ""), ("Rayner v. Secretary of Health and Human Services", "Rayner"), ("Rhyne v. Martin", "Rhyne"), ("State v. Wolverton", "Wolverton"), ("State v. Flood", "Flood"), ("Amason v. Natural Gas Pipeline Co.", "Amason"), ("United States v. Bryant", "Bryant"), ("WELLS FARGO BANK v. APACHE TRIBE OF OKLAHOMA", ""), ("Stewart v. Tupperware Corp.", "Stewart"), ("Society of New York Hosp. v. ASSOCIATED HOSP. SERV. OF NY", ""), ("Stein v. State Tax Commission", "Stein"), ( "The Putnam Pit, Inc. Geoffrey Davidian v. City of Cookeville, Tennessee Jim Shipley", "", ), ("People v. Armstrong", "Armstrong"), ("Weeks v. Weeks", "Weeks"), ("Smith v. Xerox Corp.", ""), ("In Interest of Ad", ""), ("People v. Forsyth", "Forsyth"), ("State v. LeClair", "LeClair"), ("Agristor Credit Corp. v. Unruh", "Unruh"), ("United States v. Larry L. Stewart", ""), ("Starling v. United States", "Starling"), ("United States v. Pablo Colin-Molina", ""), ("Kenneth N. Juhl v. The United States", ""), ("Matter of Wilson", "Matter of Wilson"), ("In Re Damon H.", ""), ("Centennial Ins. Co. v. Zylberberg", "Zylberberg"), ("United States v. Donald Lee Stotler", ""), ("Byndloss v. State", "Byndloss"), ("People v. Piatkowski", "Piatkowski"), ("United States v. Willie James Morgan", ""), ("Harbison (Debra) v. Thieret (James)", ""), ("Federal Land Bank of Columbia v. Lieben", "Lieben"), ("John Willard Greywind v. John T. Podrebarac", ""), ("State v. Powell", "Powell"), ("Carr v. Galloway", "Carr"), ("Saylors v. State", "Saylors"), ("Jones v. Franke", "Jones"), ( "In Re Robert L. Mills, Debtor. Robert L. Mills v. Sdrawde " "Titleholders, Inc., a California Corporation", "", ), ( "Pollenex Corporation v. 
Sunbeam-Home Comfort, a Division of " "Sunbeam Corp., Raymond Industrial, Limited and Raymond Marketing " "Corporation of North America", "", ), ("Longs v. State", "Longs"), ("Performance Network Solutions v. Cyberklix", "Cyberklix"), ("DiSabatino v. Salicete", "DiSabatino"), ("State v. Jennifer Nicole Jackson", ""), ("United States v. Moreno", "Moreno"), ("LOGAN & KANAWHA COAL v. Banque Francaise", ""), ("State v. Harrison", "Harrison"), ("Efford v. Milam", "Efford"), ("People v. Thompson", "Thompson"), ("CINCINNATI THERMAL SPRAY v. Pender County", ""), ("JAH Ex Rel. RMH v. Wadle & Associates", ""), ("United Pub. Employees v. CITY & CTY. OF SAN FRAN.", ""), ("Warren v. Massachusetts Indemnity", "Warren"), ( 'Marion Edwards v. State Farm Insurance Company and "John Doe,"', "", ), ("Snowdon v. Grillo", "Snowdon"), ("Adam Lunsford v. Cravens Funeral Home", ""), ("State v. Dillon", "Dillon"), ("In Re Graham", "In Re Graham"), ("Durham v. Chrysler Corp.", ""), # Fails b/c Durham is a city! ("Carolyn Warrick v. Motiva Enterprises, L.L.C", ""), ("United States v. Aloi", "Aloi"), ("United States Fidelity & Guaranty v. Graham", "Graham"), ("Wildberger v. Rosenbaum", "Wildberger"), ("Truck Insurance Exchange v. Michling", "Michling"), ("Black Voters v. John J. McDonough", ""), ("State of Tennessee v. William F. Cain", ""), ("Robert J. Imbrogno v. Defense Logistics Agency", ""), ("Leetta Beachum, Administratrix v. Timothy Joseph White", ""), ("United States v. Jorge Gonzalez-Villegas", ""), ("Pitts v. Florida Bd. of Bar Examiners", "Pitts"), ("State v. Pastushin", "Pastushin"), ("Clark v. Clark", ""), ("Barrios v. Holder", "Barrios"), ("Gregory L. Lavin v. United States", ""), ("Carpenter v. Consumers Power", "Carpenter"), ("Derbabian v. S & C SNOWPLOWING, INC.", "Derbabian"), ("Bright v. LSI CORP.", "Bright"), ("State v. Brown", "Brown"), ("KENNEY v. Keebler Co.", "KENNEY"), ("Hill v. Chalanor", "Hill"), ("Washington v. New Jersey", ""), ("Sollek v. Laseter", "Sollek"), ( "United States v. John Handy Jones, International Fidelity " "Insurance Company", "", ), ("N.L.R.B. v. I. W. Corp", ""), ("Karpisek v. Cather & Sons Construction, Inc.", "Karpisek"), ("Com. v. Wade", "Com."), ("Glascock v. Sukumlyn", "Glascock"), ("Burroughs v. Hills", "Burroughs"), ("State v. Darren Matthew Lee", ""), ("Mastondrea v. Occidental Hotels Management", "Mastondrea"), ("Kent v. C. I. R", "Kent"), ("Johnson v. City of Detroit", ""), ("Nolan v. United States", "Nolan"), ("Currence v. Denver Tramway Corporation", "Currence"), ("Matter of Cano", "Matter of Cano"), # Two words after "Matter of --> Punt." ("Matter of Alphabet Soup", ""), # Zero words after "Matter of" --> Punt. ("Matter of", "Matter of"), ("Simmons v. Stalder", "Simmons"), ("United States v. Donnell Hagood", ""), ("Kale v. United States INS", "Kale"), ("Cmk v. Department of Revenue Ex Rel. Kb", "Cmk"), ("State Farm Mut. Auto. Ins. Co. v. Barnes", "Barnes"), ("In Re Krp", "In Re Krp"), ("CH v. Department of Children and Families", "CH"), ("Com. v. Monosky", "Com."), ("JITNEY-JUNGLE, INCORPORATED v. City of Brookhaven", ""), ("Carolyn Humphrey v. Memorial Hospitals Association", ""), ("Wagner v. Sanders Associates, Inc.", "Wagner"), ("United States v. Venie (Arthur G.)", ""), ("Mitchell v. State", ""), ("City of Biloxi, Miss. v. Giuffrida", "Giuffrida"), ("Sexton v. St. Clair Federal Sav. Bank", "Sexton"), ("United States v. Matthews", "Matthews"), ("Freeman v. Freeman", "Freeman"), ("Spencer v. 
Toussaint", "Spencer"), ("In Re Canaday", "In Re Canaday"), ("Wenger v. Commission on Judicial Performance", "Wenger"), ("Jackson v. Janecka", "Janecka"), ("People of Michigan v. Ryan Christopher Smith", ""), ("Kincade (Michael) v. State", ""), ("Tonubbee v. River Parishes Guide", "Tonubbee"), ("United States v. Richiez", "Richiez"), ("In Re Allamaras", "In Re Allamaras"), ("United States v. Capoccia", "Capoccia"), ("Com. v. DeFranco", "Com."), ("Matheny v. Porter", "Matheny"), ("Piper v. Hoffman", "Piper"), ("People v. Smith", ""), # Punted b/c People and Smith are bad. ("Mobuary, Joseph v. State.", ""), # Punted b/c "State." has punct ] tweaker = CaseNameTweaker() for t in test_pairs: output = tweaker.make_case_name_short(t[0]) self.assertEqual( output, t[1], "Input was:\n\t%s\n\n\tExpected: '%s'\n\tActual: '%s'" % (t[0], t[1], output), )
class Command(BaseCommand): help = 'Migrate all data for all apps from one DB to another.' case_name_tweaker = CaseNameTweaker() the_beginning_of_time = make_aware(datetime(1750, 1, 1), utc) def add_arguments(self, parser): parser.add_argument( '--search', action='store_true', default=False, help="Do migrations for the models in the search app: opinions, " "oral args, and dockets" ) parser.add_argument( '--citations', action='store_true', default=False, help="Do migrations for citations between objects" ) parser.add_argument( '--user-stuff', action='store_true', default=False, help="Do migrations for user-related stuff (bar memberships, " "alerts, favorites, donations, etc.)" ) parser.add_argument( '--stats', action='store_true', default=False, help="Do migrations for stats" ) def handle(self, *args, **options): if options['search']: self.migrate_opinions_oral_args_and_dockets() if options['citations']: self.migrate_intra_object_citations() if options['user_stuff']: self.migrate_users_profiles_alerts_favorites_and_donations() if options['stats']: self.migrate_stats() @staticmethod def _none_to_blank(value): """Normalizes a field to be u'' instead of None. This is needed b/c the old models erroneously had null=True on a number of text fields. If they were set up properly according to Django conventions, they'd disallow null and have been set to blank instead. """ if value is None: return u'' else: return value def _get_case_names(self, case_name_orig): case_name_len = len(case_name_orig) max_case_name_len = 150 if case_name_len > max_case_name_len: case_name = u'' case_name_full = case_name_orig else: case_name = case_name_orig case_name_full = u'' case_name_short = self.case_name_tweaker.make_case_name_short( case_name_orig) return case_name, case_name_full, case_name_short def _print_progress(self, progress, total, errors=None): """Print the progress of a migration subcomponent. If errors is provided it should be a dict of the form: errors = { 'KeyError': 1982, 'SomeOtherError': 42, } That is, error keys should be descriptive strings, and their values should be counts of how many times it happened. Note that using a collections.Counter object for this is very handy. """ if not errors: errors = {} self.stdout.write("\r\tMigrated %s of %s (%d%%). Skipped %s: (%s)." % ( progress, total, float(progress) / total * 100, sum(errors.values()), ', '.join(['%s: %s' % (k, v) for k, v in errors.items()]), ), ending='') self.stdout.flush() def migrate_opinions_oral_args_and_dockets(self): self.stdout.write("Migrating dockets, audio files, and opinions to new " "database...") q = DocketOld.objects.using('old').all() old_dockets = queryset_generator(q) num_dockets = q.count() progress = 0 self._print_progress(progress, num_dockets) for old_docket in old_dockets: # First do the docket, then create the cluster and opinion objects. try: old_audio = old_docket.audio_files.all()[0] except IndexError: old_audio = None try: old_document = old_docket.documents.all()[0] except IndexError: old_document = None if old_document is not None: old_citation = old_document.citation old_doc_case_name, old_doc_case_name_full, old_doc_case_name_short = self._get_case_names(old_citation.case_name) if old_audio is not None: old_audio_case_name, old_audio_case_name_full, old_audio_case_name_short = self._get_case_names(old_audio.case_name) court = CourtNew.objects.get(pk=old_docket.court_id) # Courts are in place thanks to initial data. 
new_docket = DocketNew( pk=old_docket.pk, date_modified=old_docket.date_modified, date_created=old_docket.date_modified, court=court, case_name=old_doc_case_name, case_name_full=old_doc_case_name_full, case_name_short=old_doc_case_name_short, slug=self._none_to_blank(old_docket.slug), docket_number=self._none_to_blank(old_citation.docket_number), date_blocked=old_docket.date_blocked, blocked=old_docket.blocked, ) if old_audio is not None: new_docket.date_argued = old_audio.date_argued new_docket.save(using='default') if old_document is not None: new_opinion_cluster = OpinionClusterNew( pk=old_document.pk, docket=new_docket, judges=self._none_to_blank(old_document.judges), date_modified=old_document.date_modified, date_created=old_document.date_modified, date_filed=old_document.date_filed, slug=self._none_to_blank(old_citation.slug), citation_id=old_document.citation_id, case_name_short=old_doc_case_name_short, case_name=old_doc_case_name, case_name_full=old_doc_case_name_full, federal_cite_one=self._none_to_blank( old_citation.federal_cite_one), federal_cite_two=self._none_to_blank( old_citation.federal_cite_two), federal_cite_three=self._none_to_blank( old_citation.federal_cite_three), state_cite_one=self._none_to_blank( old_citation.state_cite_one), state_cite_two=self._none_to_blank( old_citation.state_cite_two), state_cite_three=self._none_to_blank( old_citation.state_cite_three), state_cite_regional=self._none_to_blank( old_citation.state_cite_regional), specialty_cite_one=self._none_to_blank( old_citation.specialty_cite_one), scotus_early_cite=self._none_to_blank( old_citation.scotus_early_cite), lexis_cite=self._none_to_blank(old_citation.lexis_cite), westlaw_cite=self._none_to_blank(old_citation.westlaw_cite), neutral_cite=self._none_to_blank(old_citation.neutral_cite), scdb_id=self._none_to_blank( old_document.supreme_court_db_id), source=old_document.source, nature_of_suit=old_document.nature_of_suit, citation_count=old_document.citation_count, precedential_status=old_document.precedential_status, date_blocked=old_document.date_blocked, blocked=old_document.blocked, ) new_opinion_cluster.save( using='default', index=False, ) new_opinion = OpinionNew( pk=old_document.pk, cluster=new_opinion_cluster, date_modified=old_document.date_modified, date_created=old_document.time_retrieved, type='010combined', sha1=old_document.sha1, download_url=old_document.download_url, local_path=old_document.local_path, plain_text=old_document.plain_text, html=self._none_to_blank(old_document.html), html_lawbox=self._none_to_blank(old_document.html_lawbox), html_with_citations=old_document.html_with_citations, extracted_by_ocr=old_document.extracted_by_ocr, ) new_opinion.save( using='default', index=False, ) if old_audio is not None: new_audio_file = AudioNew( pk=old_audio.pk, docket=new_docket, source=old_audio.source, case_name=old_audio_case_name, case_name_short=old_audio_case_name_short, case_name_full=old_audio_case_name_full, judges=self._none_to_blank(old_audio.judges), date_created=old_audio.time_retrieved, date_modified=old_audio.date_modified, sha1=old_audio.sha1, download_url=old_audio.download_url, local_path_mp3=old_audio.local_path_mp3, local_path_original_file=old_audio.local_path_original_file, duration=old_audio.duration, processing_complete=old_audio.processing_complete, date_blocked=old_audio.date_blocked, blocked=old_audio.blocked, ) new_audio_file.save( using='default', index=False, ) progress += 1 self._print_progress(progress, num_dockets) self.stdout.write(u'') # Newline def 
migrate_intra_object_citations(self): """This method migrates the citations from one database to the other so that we don't have to run the citation finding algorithm immediately after the migration. Recall that in the legacy schema, Documents have a One-2-Many relationship with Citations. This algo handles two kinds of citations. The first is the simple case (1 to 1): +--> C2--D2 / D1--cites-- \ +--> C3--D3 This is handled by making a new connection such that D1 cites D2 and D3: D1 --cites--> D2 and D1 --cites--> D3 The next kind of citation handled is more difficult. In this case, multiple Documents share a single Citation (1 to N). +--D2 | +--> C1--+ / | D1--cites-- +--D3 \ +--> C2--D4 This is handled by making the original document cite to all the targets: D1--cites-->D2 D1--cites-->D3 D1--cites-->D4 """ self.stdout.write("Migrating citation references to new database...") self.stdout.write("\tBuilding lookup dict of Citation IDs to " "Document IDs...") # Build lookup dict in memory to avoid DB hits in a moment citation_document_pairs = DocumentOld.objects.using( 'old' ).values_list( 'citation_id', 'pk' ) # This dict takes the form of: # { # citation_id: [ # document_id1, # document_id2, # ... # ], # ... # } # # The basic idea is that for any citation object's ID, you can lookup a # list of the documents that have it associated with them. cite_to_doc_dict = {} for citation_id, document_pk in citation_document_pairs: if citation_id in cite_to_doc_dict: cite_to_doc_dict[citation_id].append(document_pk) else: cite_to_doc_dict[citation_id] = [document_pk] # Iterate over all existing citations and move them to the correct place self.stdout.write( "\tBuilding list of all citations from Documents to Citations..." ) DocumentCitationsOld = DocumentOld.cases_cited.through all_citations = DocumentCitationsOld.objects.using('old') total_count = all_citations.count() citation_values = all_citations.values_list( 'document_id', 'citation_id' ) progress = 0 errors = Counter() starting_point = 14514268 # For use with failed scripts. self._print_progress(progress, total_count, errors) new_citations = [] for document_id, citation_id in citation_values: if progress < starting_point: errors.update(['AlreadyDone']) progress += 1 continue # Early abort if the Citation object has been deleted from the DB. try: cited_documents = cite_to_doc_dict[citation_id] except KeyError: errors.update(['KeyError:OrphanCitation']) continue for cited_document in cited_documents: new_citations.append( OpinionsCitedNew( citing_opinion_id=document_id, cited_opinion_id=cited_document, ) ) if len(new_citations) % 100 == 0: try: OpinionsCitedNew.objects.using( 'default' ).bulk_create( new_citations ) except IntegrityError: # Loop through each opinion and save it, marking the # failures. Could do this in the first place, but it's # slower. for new_citation in new_citations: try: new_citation.save() except IntegrityError: errors.update(['IntegrityError:CiteFromOrToMissingOpinionID']) continue new_citations = [] progress += 1 self._print_progress(progress, total_count, errors) # One final push if there's anything left. 
if len(new_citations) > 0: OpinionsCitedNew.objects.using('default').bulk_create(new_citations) self.stdout.write(u'') # Newline def migrate_users_profiles_alerts_favorites_and_donations(self): self.stdout.write("Migrating users, profiles, alerts, favorites, and " "donations to the new database...") old_users = User.objects.using('old').all() num_users = old_users.count() progress = 0 self._print_progress(progress, num_users) for old_user in old_users: old_profile = old_user.profile_legacy old_alerts = old_profile.alert.all() old_favorites = old_profile.favorite.all() old_donations = old_profile.donation.all() new_user = User( pk=old_user.pk, username=old_user.username, first_name=old_user.first_name, last_name=old_user.last_name, email=old_user.email, is_staff=old_user.is_staff, is_active=old_user.is_active, is_superuser=old_user.is_superuser, date_joined=old_user.date_joined, last_login=old_user.last_login, password=old_user.password, ) new_user.save(using='default') new_profile = UserProfileNew( pk=old_profile.pk, user=new_user, stub_account=old_profile.stub_account, employer=old_profile.employer, address1=old_profile.address1, address2=old_profile.address2, city=old_profile.city, state=old_profile.state, zip_code=old_profile.zip_code, avatar=old_profile.avatar, wants_newsletter=old_profile.wants_newsletter, plaintext_preferred=old_profile.plaintext_preferred, activation_key=old_profile.activation_key, key_expires=old_profile.key_expires, email_confirmed=old_profile.email_confirmed, ) new_profile.save(using='default') new_profile.barmembership.add( *[membership.pk for membership in old_profile.barmembership.all()] ) for old_alert in old_alerts: new_alert = AlertNew( pk=old_alert.pk, user=new_user, date_created=self.the_beginning_of_time, date_modified=self.the_beginning_of_time, name=old_alert.name, query=old_alert.query, rate=old_alert.rate, always_send_email=old_alert.always_send_email, date_last_hit=old_alert.date_last_hit, ) new_alert.save(using='default') for old_favorite in old_favorites: opinion_fave_pk = getattr(old_favorite.doc_id, 'pk', None) audio_fave_pk = getattr(old_favorite.audio_id, 'pk', None) if opinion_fave_pk is not None: cluster = OpinionClusterNew.objects.get( pk=opinion_fave_pk) audio = None else: cluster = None audio = AudioNew.objects.get(pk=audio_fave_pk) new_favorite = FavoriteNew( pk=old_favorite.pk, user=new_user, cluster_id=cluster, audio_id=audio, date_created=old_favorite.date_modified or now(), date_modified=old_favorite.date_modified or now(), name=old_favorite.name, notes=old_favorite.notes, ) new_favorite.save(using='default') for old_donation in old_donations: new_donation = DonationNew( pk=old_donation.pk, donor=new_user, date_modified=old_donation.date_modified, date_created=old_donation.date_created, clearing_date=old_donation.clearing_date, send_annual_reminder=old_donation.send_annual_reminder, amount=old_donation.amount, payment_provider=old_donation.payment_provider, payment_id=old_donation.payment_id, transaction_id=old_donation.transaction_id, status=old_donation.status, referrer=old_donation.referrer, ) new_donation.save(using='default') progress += 1 self._print_progress(progress, num_users) self.stdout.write(u'') # Do a newline... def migrate_stats(self): self.stdout.write("Migrating stats to the new database...") # Stats use the same model in new and old, with no db_table definitions. # Makes life oh-so-easy. 
old_stats = Stat.objects.using('old').all() stat_count = old_stats.count() progress = 0 self._print_progress(progress, stat_count) for old_stat in old_stats: old_stat.save(using='default') progress += 1 self._print_progress(progress, stat_count) self.stdout.write(u'') # Do a newline...
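# --- Usage sketch (not from the snippets above) ---
# _print_progress() above keeps the migration on a single console line by
# rewriting it with a carriage return and summarizing skips from a Counter.
# A standalone sketch of that reporting style with illustrative names.
import sys
from collections import Counter


def print_progress(progress, total, errors=None):
    errors = errors or Counter()
    detail = ", ".join("%s: %s" % (k, v) for k, v in errors.items())
    sys.stdout.write(
        "\r\tMigrated %s of %s (%d%%). Skipped %s: (%s)."
        % (progress, total, float(progress) / total * 100,
           sum(errors.values()), detail)
    )
    sys.stdout.flush()


errors = Counter({"IntegrityError": 3})
for i in range(1, 101):
    print_progress(i, 100, errors)
print("")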
def test_scrape_all_example_files(self):
    """Finds all the $module_example* files and tests them with the sample
    scraper.
    """
    module_strings = build_module_list('juriscraper')
    num_scrapers = len([s for s in module_strings
                        if 'backscraper' not in s])
    print "Testing {count} scrapers against their example files:".format(
        count=num_scrapers)
    max_len_mod_string = max(len(mod) for mod in module_strings
                             if 'backscraper' not in mod) + 2
    num_example_files = 0
    num_warnings = 0
    cnt = CaseNameTweaker()
    for module_string in module_strings:
        package, module = module_string.rsplit('.', 1)
        mod = __import__("%s.%s" % (package, module),
                         globals(), locals(), [module])
        if 'backscraper' not in module_string:
            sys.stdout.write(
                ' %s ' % module_string.ljust(max_len_mod_string)
            )
            sys.stdout.flush()
            # module_parts:
            #   [0]  - "juriscraper"
            #   [1]  - "opinions" or "oral_args"
            #   ...  - rest of the path
            #   [-1] - module name
            module_parts = module_string.split('.')
            example_path = os.path.join(
                "juriscraper", "tests", "examples", module_parts[1],
                "united_states", module_parts[-1],
            )
            paths = glob.glob('%s_example*' % example_path)
            self.assertTrue(
                paths,
                "No example file found for: %s! \n\nThe test looked in: "
                "%s" % (
                    module_string.rsplit('.', 1)[1],
                    os.path.join(os.getcwd(), example_path),
                ))
            num_example_files += len(paths)
            t1 = time.time()
            num_tests = len(paths)
            for path in paths:
                # This loop allows multiple example files per module
                if path.endswith('~'):
                    # Text editor backup: Not interesting.
                    continue
                site = mod.Site(cnt=cnt)
                site.url = path
                # Forces a local GET
                site.method = 'LOCAL'
                site.parse()
            t2 = time.time()
            max_speed = 10
            warn_speed = 1
            speed = t2 - t1
            msg = ''
            if speed > max_speed:
                if sys.gettrace() is None:
                    # Only do this if we're not debugging. Debuggers make
                    # things slower and breakpoints make things stop.
                    raise SlownessException(
                        "This scraper took {speed}s to test, which is more "
                        "than the allowed speed of {max_speed}s. "
                        "Please speed it up for tests to pass.".format(
                            speed=speed,
                            max_speed=max_speed,
                        ))
            elif speed > warn_speed:
                msg = ' - WARNING: SLOW SCRAPER'
                num_warnings += 1
            else:
                msg = ''
            print '(%s test(s) in %0.1f seconds%s)' % (
                num_tests, speed, msg
            )

    print ("\n{num_scrapers} scrapers tested successfully against "
           "{num_example_files} example files, with {num_warnings} "
           "speed warnings.".format(
               num_scrapers=num_scrapers,
               num_example_files=num_example_files,
               num_warnings=num_warnings,
           ))
    if num_warnings:
        print ("\nAt least one speed warning was triggered during the "
               "tests. If this is due to a slow scraper you wrote, we "
               "suggest attempting to speed it up, as it will be slow "
               "both in production and while running tests. This is "
               "currently a warning, but may raise a failure in the "
               "future as performance requirements are tightened.")
    else:
        # Someday, this line of code will be run. That day is not today.
        print "\nNo speed warnings detected. That's great, keep up the " \
              "good work!"
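# --- Usage sketch (not from the snippets above) ---
# The test above exercises every scraper against its example files. The same
# trick works for a single scraper during development: point the site at a
# local file and set the method to 'LOCAL' so parse() reads from disk instead
# of issuing an HTTP GET. The example file path below is illustrative.
from juriscraper.lib.string_utils import CaseNameTweaker
from juriscraper.opinions.united_states.federal_appellate import ca1

site = ca1.Site(cnt=CaseNameTweaker())
site.url = "juriscraper/tests/examples/opinions/united_states/ca1_example.html"
site.method = "LOCAL"
site.parse()
for item in site:
    print(item["case_names"])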
class PacerXMLParser(object): """A class to parse a PACER XML file""" cnt = CaseNameTweaker() def __init__(self, path): logger.info("Initializing parser for %s" % path) # High-level attributes self.path = path self.xml = self.get_xml_contents() self.case_details = self.get_case_details() self.document_list = self.get_document_list() self.document_count = self.get_document_count() # Docket attributes self.court = self.get_court() self.docket_number = self.get_str_from_node(self.case_details, 'docket_num') self.pacer_case_id = self.get_str_from_node(self.case_details, 'pacer_case_num') self.date_filed = self.get_datetime_from_node(self.case_details, 'date_case_filed', cast_to_date=True) self.date_terminated = self.get_datetime_from_node( self.case_details, 'date_case_terminated', cast_to_date=True) self.date_last_filing = self.get_datetime_from_node(self.case_details, 'date_last_filing', cast_to_date=True) self.case_name = harmonize( self.get_str_from_node(self.case_details, 'case_name')) self.case_name_short = self.cnt.make_case_name_short(self.case_name) self.cause = self.get_str_from_node(self.case_details, 'case_cause') self.nature_of_suit = self.get_str_from_node(self.case_details, 'nature_of_suit') self.jury_demand = self.get_str_from_node(self.case_details, 'jury_demand') self.jurisdiction_type = self.get_str_from_node( self.case_details, 'jurisdiction') self.assigned_to, self.assigned_to_str = self.get_judges('assigned_to') self.referred_to, self.referred_to_str = self.get_judges('referred_to') self.blocked, self.date_blocked = self.set_blocked_fields() # Non-parsed fields self.filepath_local = os.path.join('recap', self.path) self.filepath_ia = get_docketxml_url_from_path(self.path) def save(self, debug): """Save the item to the database, updating any existing items. Returns None if an error occurs. """ required_fields = ['case_name', 'date_filed'] for field in required_fields: if not getattr(self, field): print " Missing required field: %s" % field return None try: d = Docket.objects.get( Q(pacer_case_id=self.pacer_case_id) | Q(docket_number=self.docket_number), court=self.court, ) # Add RECAP as a source if it's not already. if d.source in [Docket.DEFAULT, Docket.SCRAPER]: d.source = Docket.RECAP_AND_SCRAPER elif d.source == Docket.COLUMBIA: d.source = Docket.COLUMBIA_AND_RECAP elif d.source == Docket.COLUMBIA_AND_SCRAPER: d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER except Docket.DoesNotExist: d = Docket(source=Docket.RECAP) except Docket.MultipleObjectsReturned: print " Got multiple results while attempting save." return None for attr, v in self.__dict__.items(): setattr(d, attr, v) if not debug: d.save() print " Saved as Docket %s: https://www.courtlistener.com%s" % ( d.pk, d.get_absolute_url()) return d def get_xml_contents(self): """Extract the XML from the file on disk and return it as an lxml tree """ xml_parser = etree.XMLParser(recover=True) tree = etree.parse(self.path, xml_parser) return tree def get_case_details(self): """Most of the details are in the case_details node, so set it aside for faster parsing. """ return self.xml.xpath('//case_details')[0] def get_document_list(self): """Get the XML nodes for the documents""" return self.xml.xpath('//document_list/document') def get_document_count(self): """Get the number of documents associated with this docket.""" return len(self.document_list) def make_documents(self, docket, debug): """Parse through the document nodes, making good objects. 
For every node, create a line item on the Docket (a DocketEntry), and create 1..n additional RECAPDocuments (attachments or regular documents) that are associated with that DocketEntry. Returns None if an error occurs. """ recap_docs = [] for doc_node in self.document_list: # Make a DocketEntry object entry_number = doc_node.xpath('@doc_num')[0] attachment_number = int(doc_node.xpath('@attachment_num')[0]) print "Working on document %s, attachment %s" % (entry_number, attachment_number) if attachment_number == 0: document_type = RECAPDocument.PACER_DOCUMENT else: document_type = RECAPDocument.ATTACHMENT try: docket_entry = DocketEntry.objects.get( docket=docket, entry_number=entry_number, ) except DocketEntry.DoesNotExist: if document_type == RECAPDocument.PACER_DOCUMENT: docket_entry = DocketEntry( docket=docket, entry_number=entry_number, ) else: logger.error( "Tried to create attachment without a DocketEntry " "object to associate it with.") continue if document_type == RECAPDocument.PACER_DOCUMENT: date_filed = (self.get_datetime_from_node( doc_node, 'date_filed', cast_to_date=True) or docket_entry.date_filed) docket_entry.date_filed = date_filed docket_entry.description = (self.get_str_from_node( doc_node, 'long_desc') or docket_entry.description) try: if not debug: docket_entry.save() except (IntegrityError, DocketEntry.MultipleObjectsReturned) as e: logger.error( "Unable to create docket entry for docket #%s, on " "entry: %s." % (docket, entry_number)) continue recap_doc = self.make_recap_document( doc_node, docket_entry, entry_number, attachment_number, document_type, debug, ) if recap_doc is not None: recap_docs.append(recap_doc) return [item.pk for item in recap_docs] def make_recap_document(self, doc_node, docket_entry, entry_number, attachment_number, document_type, debug): """Make a PACER document.""" pacer_document_id = self.get_str_from_node(doc_node, 'pacer_doc_id') try: d = RECAPDocument.objects.get( docket_entry=docket_entry, document_number=entry_number, # Use the attachment number if it is not 0, else use None. attachment_number=attachment_number or None, ) except RECAPDocument.DoesNotExist: d = RECAPDocument( docket_entry=docket_entry, pacer_doc_id=pacer_document_id, ) else: d.pacer_doc_id = pacer_document_id or d.pacer_doc_id d.date_upload = self.get_datetime_from_node(doc_node, 'upload_date') d.document_type = document_type or d.document_type d.document_number = entry_number # If we can't parse the availability node (it returns None), default it # to False. 
        availability = self.get_bool_from_node(doc_node, 'available')
        d.is_available = False if availability is None else availability
        d.sha1 = self.get_str_from_node(doc_node, 'sha1')
        d.description = (self.get_str_from_node(doc_node, 'short_desc') or
                         d.description)
        if d.is_available:
            d.filepath_ia = get_ia_document_url_from_path(
                self.path, entry_number, attachment_number)
            d.filepath_local = os.path.join(
                'recap',
                get_local_document_url_from_path(self.path, entry_number,
                                                 attachment_number),
            )
            if d.page_count is None:
                extension = d.filepath_local.path.split('.')[-1]
                d.page_count = get_page_count(d.filepath_local.path, extension)
        if document_type == RECAPDocument.ATTACHMENT:
            d.attachment_number = attachment_number
        if not debug:
            try:
                d.save(do_extraction=False, index=False)
            except IntegrityError as e:
                # This happens when a pacer_doc_id has been wrongly set as
                # the document_number, see for example, document 19 and
                # document 00405193374 here: https://ia802300.us.archive.org/23/items/gov.uscourts.ca4.14-1872/gov.uscourts.ca4.14-1872.docket.xml
                logger.error(
                    "Unable to create RECAPDocument for document #%s, "
                    "attachment #%s on entry: %s due to "
                    "IntegrityError." % (d.document_number,
                                         d.attachment_number,
                                         d.docket_entry))
                return None
        return d

    def get_court(self):
        """Extract the court from the XML and return it as a Court object"""
        court_str = self.case_details.xpath('court/text()')[0].strip()
        try:
            c = Court.objects.get(pk=pacer_to_cl_ids.get(court_str, court_str))
        except Court.DoesNotExist:
            raise ParsingException("Unable to identify court: %s" % court_str)
        else:
            return c

    @staticmethod
    def get_bool_from_node(node, path):
        try:
            s = node.xpath('%s/text()' % path)[0].strip()
            n = int(s)
        except IndexError:
            print(" Couldn't get bool from path: %s" % path)
            return None
        except ValueError:
            print(" Couldn't convert text '%s' to int when making boolean "
                  "for path: %s" % (s, path))
            return None
        else:
            return bool(n)

    @staticmethod
    def get_str_from_node(node, path):
        try:
            s = node.xpath('%s/text()' % path)[0].strip()
        except IndexError:
            print(" Couldn't get string from path: %s" % path)
            return ''  # Return an empty string. Don't return None.
        else:
            return s

    def get_int_from_details(self, node):
        s = self.case_details.xpath('%s/text()' % node)[0].strip()
        try:
            return int(s)
        except ValueError:
            # Can't parse string to int
            print(" Couldn't get int for node %s" % node)
            raise ParsingException("Cannot extract int for node %s" % node)

    @staticmethod
    def get_datetime_from_node(node, path, cast_to_date=False):
        """Parse a datetime from the XML located at node."""
        try:
            s = node.xpath('%s/text()' % path)[0].strip()
        except IndexError:
            print(" Couldn't get date from path: %s" % path)
            return None
        else:
            try:
                d = parser.parse(s)
            except ValueError:
                print(" Couldn't parse date: %s" % s)
                return None
            else:
                d = d.replace(tzinfo=d.tzinfo or gettz('UTC'))  # Set it to UTC.
                if cast_to_date is True:
                    return d.date()
                return d

    def get_judges(self, node):
        """Parse out the judge string and then look it up in the DB"""
        try:
            s = self.case_details.xpath('%s/text()' % node)[0].strip()
        except IndexError:
            print(" Couldn't get judge for node: %s" % node)
            return None, ''
        else:
            judge_names = find_judge_names(s)
            judges = []
            for judge_name in judge_names:
                judges.append(find_person(judge_name, self.court.pk,
                                          case_date=self.date_filed))
            judges = [c for c in judges if c is not None]
            if len(judges) == 0:
                print(" No judges found after lookup.")
                logger.info("No judge for: %s" %
                            ((s, self.court.pk, self.date_filed),))
                return None, s
            elif len(judges) == 1:
                return judges[0], s
            elif len(judges) > 1:
                print(" Too many judges found: %s" % len(judges))
                return None, s

    def set_blocked_fields(self):
        """Set the blocked status for the Docket.

        Dockets are public (blocked is False) when:

                                         Is Bankr. Court
                                        +---------+--------+
                                        |   YES   |   NO   |
                        +---------------+---------+--------+
                 Size   |  > 500 items  |    X    |    X   |
                  of    +---------------+---------+--------+
                Docket  | <= 500 items  |         |    X   |
                        +---------------+---------+--------+
        """
        bankruptcy_privacy_threshold = 500
        small_case = self.document_count <= bankruptcy_privacy_threshold
        if all([small_case, self.court.is_bankruptcy]):
            return True, date.today()
        return False, None
class Command(VerboseCommand): help = 'Runs the Juriscraper toolkit against one or many jurisdictions.' def __init__(self, stdout=None, stderr=None, no_color=False): super(Command, self).__init__(stdout=None, stderr=None, no_color=False) self.cnt = CaseNameTweaker() def add_arguments(self, parser): parser.add_argument( '--daemon', action='store_true', default=False, help=('Use this flag to turn on daemon mode, in which all ' 'courts requested will be scraped in turn, ' 'nonstop, in a loop.'), ) parser.add_argument( '--rate', type=int, default=30, help=('The length of time in minutes it takes to crawl ' 'all requested courts. Particularly useful if it is ' 'desired to quickly scrape over all courts. Default ' 'is 30 minutes.'), ) parser.add_argument( '--courts', type=str, dest='court_id', metavar="COURTID", required=True, help=('The court(s) to scrape and extract. This should be ' 'in the form of a python module or package import ' 'from the Juriscraper library, e.g. ' '"juriscraper.opinions.united_states.federal_appellate.ca1" ' 'or simply "opinions" to do all opinions.'), ) parser.add_argument( '--fullcrawl', dest='full_crawl', action='store_true', default=False, help="Disable duplicate aborting.", ) def make_objects(self, item, court, sha1_hash, content): """Takes the meta data from the scraper and associates it with objects. Returns the created objects. """ blocked = item['blocked_statuses'] if blocked: date_blocked = date.today() else: date_blocked = None case_name_short = (item.get('case_name_shorts') or self.cnt.make_case_name_short(item['case_names'])) docket = Docket( docket_number=item.get('docket_numbers', ''), case_name=item['case_names'], case_name_short=case_name_short, court=court, blocked=blocked, date_blocked=date_blocked, source=Docket.SCRAPER, ) west_cite_str = item.get('west_citations', '') state_cite_str = item.get('west_state_citations', '') neutral_cite_str = item.get('neutral_citations', '') cluster = OpinionCluster( judges=item.get('judges', ''), date_filed=item['case_dates'], date_filed_is_approximate=item['date_filed_is_approximate'], case_name=item['case_names'], case_name_short=case_name_short, source='C', precedential_status=item['precedential_statuses'], nature_of_suit=item.get('nature_of_suit', ''), blocked=blocked, date_blocked=date_blocked, # These three fields are replaced below. federal_cite_one=west_cite_str, state_cite_one=state_cite_str, neutral_cite=neutral_cite_str, syllabus=item.get('summaries', ''), ) citations = [] cite_types = [ (west_cite_str, Citation.WEST), (state_cite_str, Citation.STATE), (neutral_cite_str, Citation.NEUTRAL), ] for cite_str, cite_type in cite_types: if cite_str: citations.append(make_citation(cite_str, cluster, cite_type)) opinion = Opinion( type='010combined', sha1=sha1_hash, download_url=item['download_urls'], ) error = False try: cf = ContentFile(content) extension = get_extension(content) file_name = trunc(item['case_names'].lower(), 75) + extension opinion.file_with_date = cluster.date_filed opinion.local_path.save(file_name, cf, save=False) except: msg = ('Unable to save binary to disk. Deleted ' 'item: %s.\n %s' % (item['case_names'], traceback.format_exc())) logger.critical(msg.encode('utf-8')) ErrorLog(log_level='CRITICAL', court=court, message=msg).save() error = True return docket, opinion, cluster, citations, error def save_everything(self, items, index=False, backscrape=False): """Saves all the sub items and associates them as appropriate. 
""" docket, cluster = items['docket'], items['cluster'] opinion, citations = items['opinion'], items['citations'] docket.save() cluster.docket = docket cluster.save(index=False) # Index only when the opinion is associated. for citation in citations: citation.cluster_id = cluster.pk citation.save() if cluster.judges: candidate_judges = get_candidate_judges( cluster.judges, docket.court.pk, cluster.date_filed, ) if len(candidate_judges) == 1: opinion.author = candidate_judges[0] if len(candidate_judges) > 1: for candidate in candidate_judges: cluster.panel.add(candidate) opinion.cluster = cluster opinion.save(index=index) if not backscrape: RealTimeQueue.objects.create(item_type='o', item_pk=opinion.pk) def scrape_court(self, site, full_crawl=False): download_error = False # Get the court object early for logging # opinions.united_states.federal.ca9_u --> ca9 court_str = site.court_id.split('.')[-1].split('_')[0] court = Court.objects.get(pk=court_str) dup_checker = DupChecker(court, full_crawl=full_crawl) abort = dup_checker.abort_by_url_hash(site.url, site.hash) if not abort: if site.cookies: logger.info("Using cookies: %s" % site.cookies) for i, item in enumerate(site): msg, r = get_binary_content( item['download_urls'], site.cookies, site._get_adapter_instance(), method=site.method ) if msg: logger.warn(msg) ErrorLog(log_level='WARNING', court=court, message=msg).save() continue content = site.cleanup_content(r.content) current_date = item['case_dates'] try: next_date = site[i + 1]['case_dates'] except IndexError: next_date = None # request.content is sometimes a str, sometimes unicode, so # force it all to be bytes, pleasing hashlib. sha1_hash = hashlib.sha1(force_bytes(content)).hexdigest() if (court_str == 'nev' and item['precedential_statuses'] == 'Unpublished'): # Nevada's non-precedential cases have different SHA1 # sums every time. lookup_params = {'lookup_value': item['download_urls'], 'lookup_by': 'download_url'} else: lookup_params = {'lookup_value': sha1_hash, 'lookup_by': 'sha1'} onwards = dup_checker.press_on(Opinion, current_date, next_date, **lookup_params) if dup_checker.emulate_break: break if onwards: # Not a duplicate, carry on logger.info('Adding new document found at: %s' % item['download_urls'].encode('utf-8')) dup_checker.reset() docket, opinion, cluster, citations, error = self.make_objects( item, court, sha1_hash, content ) if error: download_error = True continue self.save_everything( items={ 'docket': docket, 'opinion': opinion, 'cluster': cluster, 'citations': citations, }, index=False ) extract_doc_content.delay( opinion.pk, do_ocr=True, citation_jitter=True, ) logger.info("Successfully added doc {pk}: {name}".format( pk=opinion.pk, name=item['case_names'].encode('utf-8'), )) # Update the hash if everything finishes properly. logger.info("%s: Successfully crawled opinions." % site.court_id) if not download_error and not full_crawl: # Only update the hash if no errors occurred. dup_checker.update_site_hash(site.hash) def parse_and_scrape_site(self, mod, full_crawl): site = mod.Site().parse() self.scrape_court(site, full_crawl) def handle(self, *args, **options): super(Command, self).handle(*args, **options) global die_now # this line is used for handling SIGTERM (CTRL+4), so things can die # safely signal.signal(signal.SIGTERM, signal_handler) module_strings = build_module_list(options['court_id']) if not len(module_strings): raise CommandError('Unable to import module or package. 
                               Aborting.')

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        wait = (options['rate'] * 60) / num_courts
        i = 0
        while i < num_courts:
            # this catches SIGTERM, so the code can be killed safely.
            if die_now:
                logger.info("The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)
            mod = __import__(
                "%s.%s" % (package, module),
                globals(),
                locals(),
                [module]
            )
            # noinspection PyBroadException
            try:
                self.parse_and_scrape_site(mod, options['full_crawl'])
            except Exception as e:
                # noinspection PyBroadException
                try:
                    msg = ('********!! CRAWLER DOWN !!***********\n'
                           '*****scrape_court method failed!*****\n'
                           '********!! ACTION NEEDED !!**********\n%s' %
                           traceback.format_exc())
                    logger.critical(msg)

                    # opinions.united_states.federal.ca9_u --> ca9
                    court_str = mod.Site.__module__.split('.')[-1].split('_')[0]
                    court = Court.objects.get(pk=court_str)
                    ErrorLog(
                        log_level='CRITICAL',
                        court=court,
                        message=msg
                    ).save()
                except Exception as e:
                    # This is very important. Without this, an exception
                    # above will crash the caller.
                    pass
            finally:
                time.sleep(wait)
                last_court_in_list = (i == (num_courts - 1))
                if last_court_in_list and options['daemon']:
                    # Start over...
                    logger.info("All jurisdictions done. Looping back to "
                                "the beginning because daemon mode is enabled.")
                    i = 0
                else:
                    i += 1

        logger.info("The scraper has stopped.")
        sys.exit(0)
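
The --rate option drives the per-court sleep above: handle() converts the rate to seconds and divides it evenly across the requested modules. A quick worked sketch, using the default rate and an assumed court count purely for illustration:

# Pacing sketch (illustrative numbers): --rate is the minutes allotted for a
# full pass, so the per-court wait is rate * 60 / num_courts seconds.
rate = 30        # the default --rate, in minutes
num_courts = 60  # assumed number of requested court modules
wait = (rate * 60) / num_courts  # 1800 / 60 -> 30 seconds between courts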
def test_scrape_all_example_files(self): """Finds all the $module_example* files and tests them with the sample scraper. """ module_strings = build_module_list('juriscraper') num_scrapers = len([s for s in module_strings if 'backscraper' not in s]) msg = "Testing {count} scrapers against their example files:" print(msg.format(count=num_scrapers)) max_len_mod_string = max(len(mod) for mod in module_strings if 'backscraper' not in mod) + 2 num_example_files = 0 num_warnings = 0 cnt = CaseNameTweaker() json_compare_extension = '.compare.json' for module_string in module_strings: package, module = module_string.rsplit('.', 1) mod = __import__("%s.%s" % (package, module), globals(), locals(), [module]) if 'backscraper' not in module_string: sys.stdout.write( ' %s ' % module_string.ljust(max_len_mod_string) ) sys.stdout.flush() # module_parts: # [0] - "juriscraper" # [1] - "opinions" or "oral_args" # ... - rest of the path # [-1] - module name module_parts = module_string.split('.') example_path = os.path.join( "tests", "examples", module_parts[1], "united_states", module_parts[-1], ) paths = [path for path in glob.glob('%s_example*' % example_path) if not path.endswith(json_compare_extension)] self.assertTrue( paths, "No example file found for: %s! \n\nThe test looked in: " "%s" % ( module_string.rsplit('.', 1)[1], os.path.join(os.getcwd(), example_path), )) num_example_files += len(paths) t1 = time.time() num_tests = len(paths) for path in paths: # This loop allows multiple example files per module if path.endswith('~'): # Text editor backup: Not interesting. continue site = mod.Site(cnt=cnt) site.url = path # Forces a local GET site.method = 'LOCAL' site.parse() # Now validate that the parsed result is as we expect json_path = '%s%s' % (path.rsplit('.', 1)[0], json_compare_extension) json_data = json.loads(site.to_json(), encoding='utf-8') if os.path.isfile(json_path): # Compare result with corresponding json file example_file = path.rsplit('/', 1)[1] compare_file = json_path.rsplit('/', 1)[1] with open(json_path, 'r') as input_file: fixture_json = json.load(input_file) self.assertEqual( len(fixture_json), len(json_data), msg="Fixture and scraped data have different " "lengths: expected %s and scraped %s (%s)" % ( len(fixture_json), len(json_data), module_string ) ) for i, item in enumerate(fixture_json): self.assertEqual( fixture_json[i], json_data[i], ) else: # Generate corresponding json file if it doesn't # already exist. This should only happen once # when adding a new example html file. with open(json_path, 'w') as json_example: json.dump(json_data, json_example, indent=2) t2 = time.time() max_speed = 15 warn_speed = 1 speed = t2 - t1 msg = '' if speed > max_speed: if sys.gettrace() is None and not IS_TRAVIS: # Only do this if we're not debugging. Debuggers make # things slower and breakpoints make things stop. raise SlownessException( "This scraper took {speed}s to test, which is more " "than the allowed speed of {max_speed}s. 
" "Please speed it up for tests to pass.".format( speed=speed, max_speed=max_speed, )) elif speed > warn_speed: msg = ' - WARNING: SLOW SCRAPER' num_warnings += 1 else: msg = '' print('(%s test(s) in %0.1f seconds%s)' % (num_tests, speed, msg)) print("\n{num_scrapers} scrapers tested successfully against " "{num_example_files} example files, with {num_warnings} " "speed warnings.".format( num_scrapers=num_scrapers, num_example_files=num_example_files, num_warnings=num_warnings,)) if num_warnings: print("\nAt least one speed warning was triggered during the " "tests. If this is due to a slow scraper you wrote, we " "suggest attempting to speed it up, as it will be slow " "both in production and while running tests. This is " "currently a warning, but may raise a failure in the " "future as performance requirements are tightened.") else: # Someday, this line of code will be run. That day is not today. print("\nNo speed warnings detected. That's great, keep up the " \ "good work!")
# Functions to parse court data in XML format into a list of dictionaries.
import os
import re
import xml.etree.cElementTree as ET

import dateutil.parser as dparser
from juriscraper.lib.string_utils import titlecase, harmonize, clean_string, CaseNameTweaker
from lxml import etree

from cl.corpus_importer.court_regexes import state_pairs
from cl.lib.crypto import sha1_of_file
from parse_judges import find_judge_names
from regexes_columbia import SPECIAL_REGEXES, FOLDER_DICT

# initialized once since it takes resources
CASE_NAME_TWEAKER = CaseNameTweaker()

# tags for which content will be condensed into plain text
SIMPLE_TAGS = [
    "reporter_caption", "citation", "caption", "court", "docket",
    "posture", "date", "hearing_date", "panel", "attorneys",
]

# regex that will be applied when condensing SIMPLE_TAGS content
STRIP_REGEX = [r'</?citation.*>', r'</?page_number.*>']

# types of opinions that will be parsed
# each may have a '_byline' and '_text' node
OPINION_TYPES = ['opinion', 'dissent', 'concurrence']
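
The comments above say SIMPLE_TAGS content gets condensed to plain text with STRIP_REGEX applied along the way. The module's actual condensing helper is not shown here, so the following is only a hedged sketch of how those constants could be combined, using the imports already present in this file.

# Sketch only -- not the module's real helper. It flattens a SIMPLE_TAGS
# node's raw XML into plain text: STRIP_REGEX drops citation/page_number
# markup, any leftover tags are removed, and clean_string tidies the result.
def condense_simple_tag_text(raw_fragment):
    text = raw_fragment
    for pattern in STRIP_REGEX:
        text = re.sub(pattern, '', text)   # drop citation/page_number markup
    text = re.sub(r'<[^>]+>', ' ', text)   # drop any remaining tags
    return clean_string(' '.join(text.split()))

# e.g. condense_simple_tag_text(
#     '<caption>Lissner v. Test <page_number>*123</page_number></caption>')
# returns 'Lissner v. Test'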
def test_scrape_all_example_files(self): """Finds all the $module_example* files and tests them with the sample scraper. """ module_strings = build_module_list('juriscraper') num_scrapers = len([s for s in module_strings if 'backscraper' not in s]) max_len_mod_string = max(len(mod) for mod in module_strings if 'backscraper' not in mod) + 2 num_example_files = 0 num_warnings = 0 cnt = CaseNameTweaker() json_compare_extension = '.compare.json' json_compare_files_generated = [] for module_string in module_strings: package, module = module_string.rsplit('.', 1) mod = __import__("%s.%s" % (package, module), globals(), locals(), [module]) if 'backscraper' not in module_string: sys.stdout.write( ' %s ' % module_string.ljust(max_len_mod_string) ) sys.stdout.flush() # module_parts: # [0] - "juriscraper" # [1] - "opinions" or "oral_args" # ... - rest of the path # [-1] - module name module_parts = module_string.split('.') example_path = os.path.join( "tests", "examples", module_parts[1], "united_states", module_parts[-1], ) paths = [path for path in glob.glob('%s_example*' % example_path) if not path.endswith(json_compare_extension)] self.assertTrue( paths, "No example file found for: %s! \n\nThe test looked in: " "%s" % ( module_string.rsplit('.', 1)[1], os.path.join(os.getcwd(), example_path), )) num_example_files += len(paths) t1 = time.time() num_tests = len(paths) for path in paths: # This loop allows multiple example files per module if path.endswith('~'): # Text editor backup: Not interesting. continue site = mod.Site(cnt=cnt) site.url = path # Forces a local GET site.enable_test_mode() site.parse() # Now validate that the parsed result is as we expect json_path = '%s%s' % (path.rsplit('.', 1)[0], json_compare_extension) json_data = json.loads(site.to_json(), encoding='utf-8') if os.path.isfile(json_path): # Compare result with corresponding json file example_file = path.rsplit('/', 1)[1] compare_file = json_path.rsplit('/', 1)[1] with open(json_path, 'r') as input_file: fixture_json = json.load(input_file) self.assertEqual( len(fixture_json), len(json_data), msg="Fixture and scraped data have different " "lengths: expected %s and scraped %s (%s)" % ( len(fixture_json), len(json_data), module_string ) ) for i, item in enumerate(fixture_json): self.assertEqual( fixture_json[i], json_data[i], ) else: # Generate corresponding json file if it doesn't # already exist. This should only happen once # when adding a new example html file. warn_generated_compare_file(json_path) json_compare_files_generated.append(json_path) with open(json_path, 'w') as json_example: json.dump(json_data, json_example, indent=2) t2 = time.time() duration = t2 - t1 warning_msg = warn_or_crash_slow_parser(t2 - t1) if warning_msg: num_warnings += 1 print('(%s test(s) in %0.1f seconds)' % (num_tests, duration)) print("\n{num_scrapers} scrapers tested successfully against " "{num_example_files} example files, with {num_warnings} " "speed warnings.".format( num_scrapers=num_scrapers, num_example_files=num_example_files, num_warnings=num_warnings,)) if json_compare_files_generated: msg = 'Generated compare file(s) during test, please review before proceeding. ' \ 'If the data looks good, run tests again, then be sure to include ' \ 'the new compare file(s) in your commit: %s' self.fail(msg % ', '.join(json_compare_files_generated)) if num_warnings: print("\nAt least one speed warning was triggered during the " "tests. 
                  If this is due to a slow scraper you wrote, we "
                  "suggest attempting to speed it up, as it will be slow "
                  "both in production and while running tests. This is "
                  "currently a warning, but may raise a failure in the "
                  "future as performance requirements are tightened.")
        else:
            # Someday, this line of code will be run. That day is not today.
            print("\nNo speed warnings detected. That's great, keep up the "
                  "good work!")
)
from cl.recap.models import (
    FjcIntegratedDatabase,
    PacerFetchQueue,
    PacerHtmlFiles,
    ProcessingQueue,
    PROCESSING_STATUS,
    REQUEST_TYPE,
    UPLOAD_TYPE,
)
from cl.scrapers.tasks import extract_recap_pdf, get_page_count
from cl.search.models import Docket, DocketEntry, RECAPDocument
from cl.search.tasks import add_or_update_recap_docket, add_items_to_solr

logger = logging.getLogger(__name__)

cnt = CaseNameTweaker()


def process_recap_upload(pq):
    """Process an item uploaded from an extension or API user.

    Uploaded objects can take a variety of forms, and we'll need to
    process them accordingly.
    """
    if pq.upload_type == UPLOAD_TYPE.DOCKET:
        chain(
            process_recap_docket.s(pq.pk),
            add_or_update_recap_docket.s()
        ).apply_async()
    elif pq.upload_type == UPLOAD_TYPE.ATTACHMENT_PAGE:
        process_recap_attachment.delay(pq.pk)
    elif pq.upload_type == UPLOAD_TYPE.PDF:
def test_make_short_name(self): test_pairs = [ # In re and Matter of ('In re Lissner', 'In re Lissner'), ('Matter of Lissner', 'Matter of Lissner'), # Plaintiff is in bad word list ('State v. Lissner', 'Lissner'), ('People v. Lissner', 'Lissner'), ('California v. Lissner', 'Lissner'), ('Dallas v. Lissner', 'Lissner'), # Basic 3-word case ('Langley v. Google', 'Langley'), # Similar to above, but more than 3 words ('Langley v. Google foo', 'Langley'), # United States v. ... ('United States v. Lissner', 'Lissner'), # Corporate first name ('Google, Inc. v. Langley', 'Langley'), ('Special, LLC v. Langley', 'Langley'), ('Google Corp. v. Langley', 'Langley'), # Shorter appellant than plaintiff ('Michael Lissner v. Langley', 'Langley'), # Multi-v with and w/o a bad_word ('Alameda v. Victor v. Keyboard', ''), ('Bloggers v. Victor v. Keyboard', ''), # Long left, short right ('Many words here v. Langley', 'Langley'), # Other manually added items ('Ilarion v. State', 'Ilarion'), ('Imery v. Vangil Ingenieros', 'Imery'), # Many more tests from real data! ('Bean v. City of Monahans', 'Bean'), ('Blanke v. Time, Inc.', 'Blanke'), ('New York Life Ins. Co. v. Deshotel', 'Deshotel'), ('Deatherage v. Deatherage', 'Deatherage'), ('Gonzalez Vargas v. Holder', ''), ('Campbell v. Wainwright', 'Campbell'), ('Liggett & Myers Tobacco Co. v. Finzer', 'Finzer'), ('United States v. Brenes', 'Brenes'), ('A.H. Robins Co., Inc. v. Eli Lilly & Co', ''), ('McKellar v. Hazen', 'McKellar'), ('Gil v. State', 'Gil'), ('Fuentes v. Owen', 'Fuentes'), ('State v. Shearer', 'Shearer'), ('United States v. Smither', 'Smither'), ('People v. Bradbury', 'Bradbury'), ('Venable (James) v. State', ''), ('Burkhardt v. Bailey', 'Burkhardt'), ('DeLorenzo v. Bales', 'DeLorenzo'), ('Loucks v. Bauman', 'Loucks'), ('Kenneth Stern v. Robert Weinstein', ''), ('Rayner v. Secretary of Health and Human Services', 'Rayner'), ('Rhyne v. Martin', 'Rhyne'), ('State v. Wolverton', 'Wolverton'), ('State v. Flood', 'Flood'), ('Amason v. Natural Gas Pipeline Co.', 'Amason'), ('United States v. Bryant', 'Bryant'), ('WELLS FARGO BANK v. APACHE TRIBE OF OKLAHOMA', ''), ('Stewart v. Tupperware Corp.', 'Stewart'), ('Society of New York Hosp. v. ASSOCIATED HOSP. SERV. OF NY', ''), ('Stein v. State Tax Commission', 'Stein'), ( 'The Putnam Pit, Inc. Geoffrey Davidian v. City of Cookeville, Tennessee Jim Shipley', ''), ('People v. Armstrong', 'Armstrong'), ('Weeks v. Weeks', 'Weeks'), ('Smith v. Xerox Corp.', ''), ('In Interest of Ad', ''), ('People v. Forsyth', 'Forsyth'), ('State v. LeClair', 'LeClair'), ('Agristor Credit Corp. v. Unruh', 'Unruh'), ('United States v. Larry L. Stewart', ''), ('Starling v. United States', 'Starling'), ('United States v. Pablo Colin-Molina', ''), ('Kenneth N. Juhl v. The United States', ''), ('Matter of Wilson', 'Matter of Wilson'), ('In Re Damon H.', ''), ('Centennial Ins. Co. v. Zylberberg', 'Zylberberg'), ('United States v. Donald Lee Stotler', ''), ('Byndloss v. State', 'Byndloss'), ('People v. Piatkowski', 'Piatkowski'), ('United States v. Willie James Morgan', ''), ('Harbison (Debra) v. Thieret (James)', ''), ('Federal Land Bank of Columbia v. Lieben', 'Lieben'), ('John Willard Greywind v. John T. Podrebarac', ''), ('State v. Powell', 'Powell'), ('Carr v. Galloway', 'Carr'), ('Saylors v. State', 'Saylors'), ('Jones v. Franke', 'Jones'), ('In Re Robert L. Mills, Debtor. Robert L. Mills v. Sdrawde ' 'Titleholders, Inc., a California Corporation', ''), ('Pollenex Corporation v. 
Sunbeam-Home Comfort, a Division of ' 'Sunbeam Corp., Raymond Industrial, Limited and Raymond Marketing ' 'Corporation of North America', ''), ('Longs v. State', 'Longs'), ('Performance Network Solutions v. Cyberklix', 'Cyberklix'), ('DiSabatino v. Salicete', 'DiSabatino'), ('State v. Jennifer Nicole Jackson', ''), ('United States v. Moreno', 'Moreno'), ('LOGAN & KANAWHA COAL v. Banque Francaise', ''), ('State v. Harrison', 'Harrison'), ('Efford v. Milam', 'Efford'), ('People v. Thompson', 'Thompson'), ('CINCINNATI THERMAL SPRAY v. Pender County', ''), ('JAH Ex Rel. RMH v. Wadle & Associates', ''), ('United Pub. Employees v. CITY & CTY. OF SAN FRAN.', ''), ('Warren v. Massachusetts Indemnity', 'Warren'), ('Marion Edwards v. State Farm Insurance Company and "John Doe,"', ''), ('Snowdon v. Grillo', 'Snowdon'), ('Adam Lunsford v. Cravens Funeral Home', ''), ('State v. Dillon', 'Dillon'), ('In Re Graham', 'In Re Graham'), ('Durham v. Chrysler Corp.', ''), # Fails b/c Durham is a city! ('Carolyn Warrick v. Motiva Enterprises, L.L.C', ''), ('United States v. Aloi', 'Aloi'), ('United States Fidelity & Guaranty v. Graham', 'Graham'), ('Wildberger v. Rosenbaum', 'Wildberger'), ('Truck Insurance Exchange v. Michling', 'Michling'), ('Black Voters v. John J. McDonough', ''), ('State of Tennessee v. William F. Cain', ''), ('Robert J. Imbrogno v. Defense Logistics Agency', ''), ('Leetta Beachum, Administratrix v. Timothy Joseph White', ''), ('United States v. Jorge Gonzalez-Villegas', ''), ('Pitts v. Florida Bd. of Bar Examiners', 'Pitts'), ('State v. Pastushin', 'Pastushin'), ('Clark v. Clark', ''), ('Barrios v. Holder', 'Barrios'), ('Gregory L. Lavin v. United States', ''), ('Carpenter v. Consumers Power', 'Carpenter'), ('Derbabian v. S & C SNOWPLOWING, INC.', 'Derbabian'), ('Bright v. LSI CORP.', 'Bright'), ('State v. Brown', 'Brown'), ('KENNEY v. Keebler Co.', 'KENNEY'), ('Hill v. Chalanor', 'Hill'), ('Washington v. New Jersey', ''), ('Sollek v. Laseter', 'Sollek'), ('United States v. John Handy Jones, International Fidelity ' 'Insurance Company', ''), ('N.L.R.B. v. I. W. Corp', ''), ('Karpisek v. Cather & Sons Construction, Inc.', 'Karpisek'), ('Com. v. Wade', 'Com.'), ('Glascock v. Sukumlyn', 'Glascock'), ('Burroughs v. Hills', 'Burroughs'), ('State v. Darren Matthew Lee', ''), ('Mastondrea v. Occidental Hotels Management', 'Mastondrea'), ('Kent v. C. I. R', 'Kent'), ('Johnson v. City of Detroit', ''), ('Nolan v. United States', 'Nolan'), ('Currence v. Denver Tramway Corporation', 'Currence'), ('Matter of Cano', 'Matter of Cano'), # Two words after "Matter of --> Punt." ('Matter of Alphabet Soup', ''), # Zero words after "Matter of" --> Punt. ("Matter of", "Matter of"), ('Simmons v. Stalder', 'Simmons'), ('United States v. Donnell Hagood', ''), ('Kale v. United States INS', 'Kale'), ('Cmk v. Department of Revenue Ex Rel. Kb', 'Cmk'), ('State Farm Mut. Auto. Ins. Co. v. Barnes', 'Barnes'), ('In Re Krp', 'In Re Krp'), ('CH v. Department of Children and Families', 'CH'), ('Com. v. Monosky', 'Com.'), ('JITNEY-JUNGLE, INCORPORATED v. City of Brookhaven', ''), ('Carolyn Humphrey v. Memorial Hospitals Association', ''), ('Wagner v. Sanders Associates, Inc.', 'Wagner'), ('United States v. Venie (Arthur G.)', ''), ('Mitchell v. State', ''), ('City of Biloxi, Miss. v. Giuffrida', 'Giuffrida'), ('Sexton v. St. Clair Federal Sav. Bank', 'Sexton'), ('United States v. Matthews', 'Matthews'), ('Freeman v. Freeman', 'Freeman'), ('Spencer v. Toussaint', 'Spencer'), ('In Re Canaday', 'In Re Canaday'), ('Wenger v. 
              Commission on Judicial Performance', 'Wenger'),
            ('Jackson v. Janecka', 'Janecka'),
            ('People of Michigan v. Ryan Christopher Smith', ''),
            ('Kincade (Michael) v. State', ''),
            ('Tonubbee v. River Parishes Guide', 'Tonubbee'),
            ('United States v. Richiez', 'Richiez'),
            ('In Re Allamaras', 'In Re Allamaras'),
            ('United States v. Capoccia', 'Capoccia'),
            ('Com. v. DeFranco', 'Com.'),
            ('Matheny v. Porter', 'Matheny'),
            ('Piper v. Hoffman', 'Piper'),
            ('People v. Smith', ''),  # Punted b/c People and Smith are bad.
            ('Mobuary, Joseph v. State.', ''),  # Punted b/c "State." has punct
        ]
        tweaker = CaseNameTweaker()
        for t in test_pairs:
            output = tweaker.make_case_name_short(t[0])
            self.assertEqual(
                output, t[1],
                "Input was:\n\t%s\n\n\tExpected: '%s'\n\tActual: '%s'" %
                (t[0], t[1], output))
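
For quick reference, the behavior the test above exercises boils down to a single call. The expected outputs in this small usage example are taken directly from pairs in the test data.

# Minimal usage example; expected results come from the test pairs above.
from juriscraper.lib.string_utils import CaseNameTweaker

cnt = CaseNameTweaker()
print(cnt.make_case_name_short('Langley v. Google'))         # 'Langley'
print(cnt.make_case_name_short('United States v. Lissner'))  # 'Lissner'
print(cnt.make_case_name_short('In re Lissner'))             # 'In re Lissner'
print(cnt.make_case_name_short('People v. Smith'))           # '' -- punts when both sides are "bad words"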