def setUp(self):
    super(ReportScrapeStatusTest, self).setUp()
    self.court = Court.objects.get(pk="test")

    # Make some errors that we can tally
    ErrorLog(log_level="WARNING", court=self.court, message="test_msg").save()
    ErrorLog(log_level="CRITICAL", court=self.court, message="test_msg").save()
def make_objects(self, item, court, sha1_hash, content): """Takes the meta data from the scraper and associates it with objects. Returns the created objects. """ blocked = item['blocked_statuses'] if blocked is not None: date_blocked = date.today() else: date_blocked = None case_name_short = (item.get('case_name_shorts') or self.cnt.make_case_name_short(item['case_names'])) docket = Docket( docket_number=item.get('docket_numbers', ''), case_name=item['case_names'], case_name_short=case_name_short, court=court, blocked=blocked, date_blocked=date_blocked, source=Docket.SCRAPER, ) cluster = OpinionCluster( judges=item.get('judges', ''), date_filed=item['case_dates'], case_name=item['case_names'], case_name_short=case_name_short, source='C', precedential_status=item['precedential_statuses'], nature_of_suit=item.get('nature_of_suit', ''), blocked=blocked, date_blocked=date_blocked, federal_cite_one=item.get('west_citations', ''), state_cite_one=item.get('west_state_citations', ''), neutral_cite=item.get('neutral_citations', ''), ) opinion = Opinion( type='010combined', sha1=sha1_hash, download_url=item['download_urls'], ) error = False try: cf = ContentFile(content) extension = get_extension(content) file_name = trunc(item['case_names'].lower(), 75) + extension opinion.file_with_date = cluster.date_filed opinion.local_path.save(file_name, cf, save=False) except: msg = ('Unable to save binary to disk. Deleted ' 'item: %s.\n %s' % (item['case_names'], traceback.format_exc())) logger.critical(msg.encode('utf-8')) ErrorLog(log_level='CRITICAL', court=court, message=msg).save() error = True return docket, opinion, cluster, error
def make_objects(self, item, court, sha1_hash, content): blocked = item["blocked_statuses"] if blocked: date_blocked = date.today() else: date_blocked = None case_name_short = item.get( "case_name_shorts" ) or self.cnt.make_case_name_short(item["case_names"]) docket = Docket( docket_number=item.get("docket_numbers", ""), case_name=item["case_names"], case_name_short=case_name_short, court=court, blocked=blocked, date_blocked=date_blocked, date_argued=item["case_dates"], source=Docket.SCRAPER, ) audio_file = Audio( judges=item.get("judges", ""), source="C", case_name=item["case_names"], case_name_short=case_name_short, sha1=sha1_hash, download_url=item["download_urls"], blocked=blocked, date_blocked=date_blocked, ) error = False try: cf = ContentFile(content) extension = get_extension(content) if extension not in [".mp3", ".wma"]: extension = ( "." + item["download_urls"].lower().rsplit(".", 1)[1] ) # See bitbucket issue #215 for why this must be # lower-cased. file_name = trunc(item["case_names"].lower(), 75) + extension audio_file.file_with_date = docket.date_argued audio_file.local_path_original_file.save(file_name, cf, save=False) except: msg = ( "Unable to save binary to disk. Deleted audio file: %s.\n " "%s" % (item["case_names"], traceback.format_exc()) ) logger.critical(msg.encode("utf-8")) ErrorLog(log_level="CRITICAL", court=court, message=msg).save() error = True return docket, audio_file, error
def make_objects(self, item, court, sha1_hash, content):
    blocked = item['blocked_statuses']
    if blocked:
        date_blocked = date.today()
    else:
        date_blocked = None

    case_name_short = (item.get('case_name_shorts') or
                       self.cnt.make_case_name_short(item['case_names']))

    docket = Docket(
        docket_number=item.get('docket_numbers', ''),
        case_name=item['case_names'],
        case_name_short=case_name_short,
        court=court,
        blocked=blocked,
        date_blocked=date_blocked,
        date_argued=item['case_dates'],
        source=Docket.SCRAPER,
    )
    audio_file = Audio(
        judges=item.get('judges', ''),
        source='C',
        case_name=item['case_names'],
        case_name_short=case_name_short,
        sha1=sha1_hash,
        download_url=item['download_urls'],
        blocked=blocked,
        date_blocked=date_blocked,
    )

    error = False
    try:
        cf = ContentFile(content)
        extension = get_extension(content)
        if extension not in ['.mp3', '.wma']:
            extension = '.' + item['download_urls'].lower().rsplit('.', 1)[1]
        # See bitbucket issue #215 for why this must be lower-cased.
        file_name = trunc(item['case_names'].lower(), 75) + extension
        audio_file.file_with_date = docket.date_argued
        audio_file.local_path_original_file.save(file_name, cf, save=False)
    except:
        msg = 'Unable to save binary to disk. Deleted audio file: %s.\n ' \
              '%s' % (item['case_names'], traceback.format_exc())
        logger.critical(msg.encode('utf-8'))
        ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
        error = True

    return docket, audio_file, error
def handle(self, *args, **options):
    global die_now

    # this line is used for handling SIGTERM (CTRL+4), so things can die
    # safely
    signal.signal(signal.SIGTERM, signal_handler)

    module_strings = build_module_list(options['court_id'])
    if not len(module_strings):
        raise CommandError('Unable to import module or package. Aborting.')

    logger.info("Starting up the scraper.")
    num_courts = len(module_strings)
    wait = (options['rate'] * 60) / num_courts
    i = 0
    while i < num_courts:
        # this catches SIGTERM, so the code can be killed safely.
        if die_now:
            logger.info("The scraper has stopped.")
            sys.exit(1)

        package, module = module_strings[i].rsplit('.', 1)
        mod = __import__("%s.%s" % (package, module),
                         globals(), locals(), [module])
        # noinspection PyBroadException
        try:
            self.parse_and_scrape_site(mod, options['full_crawl'])
        except Exception, e:
            # noinspection PyBroadException
            try:
                msg = ('********!! CRAWLER DOWN !!***********\n'
                       '*****scrape_court method failed!*****\n'
                       '********!! ACTION NEEDED !!**********\n%s' %
                       traceback.format_exc())
                logger.critical(msg)

                # opinions.united_states.federal.ca9_u --> ca9
                court_str = mod.Site.__module__.split('.')[-1].split('_')[0]
                court = Court.objects.get(pk=court_str)
                ErrorLog(log_level='CRITICAL', court=court,
                         message=msg).save()
            except Exception, e:
                # This is very important. Without this, an exception
                # above will crash the caller.
                pass
        finally:
            # Pace the crawl, then advance to the next court.
            time.sleep(wait)
            i += 1
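# The die_now flag and signal_handler used by the handle() methods are
# defined at module level but not shown; a minimal sketch consistent with
# their usage (an assumption, not the project's actual code):
die_now = False


def signal_handler(signal_number, frame):
    # Flip the flag so the main loop exits at its next safe checkpoint
    # instead of dying mid-scrape.
    global die_now
    die_now = True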
def scrape_court(self, site, full_crawl=False, backscrape=False):
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split(".")[-1].split("_")[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(court, full_crawl=full_crawl)
    abort = dup_checker.abort_by_url_hash(site.url, site.hash)
    if not abort:
        if site.cookies:
            logger.info("Using cookies: %s" % site.cookies)
        for i, item in enumerate(site):
            msg, r = get_binary_content(
                item["download_urls"],
                site.cookies,
                site._get_adapter_instance(),
                method=site.method,
            )
            if msg:
                logger.warning(msg)
                ErrorLog(
                    log_level="WARNING", court=court, message=msg
                ).save()
                continue

            content = site.cleanup_content(r.content)

            current_date = item["case_dates"]
            try:
                next_date = site[i + 1]["case_dates"]
            except IndexError:
                next_date = None

            # request.content is sometimes a str, sometimes unicode, so
            # force it all to be bytes, pleasing hashlib.
            sha1_hash = sha1(force_bytes(content))
            onwards = dup_checker.press_on(
                Audio,
                current_date,
                next_date,
                lookup_value=sha1_hash,
                lookup_by="sha1",
            )
            if dup_checker.emulate_break:
                break

            if onwards:
                # Not a duplicate, carry on
                logger.info(
                    "Adding new document found at: %s"
                    % item["download_urls"].encode("utf-8")
                )
                dup_checker.reset()

                docket, audio_file, error = make_objects(
                    item, court, sha1_hash, content
                )
                if error:
                    download_error = True
                    continue

                save_everything(
                    items={"docket": docket, "audio_file": audio_file},
                    index=False,
                    backscrape=backscrape,
                )
                process_audio_file.apply_async(
                    (audio_file.pk,), countdown=random.randint(0, 3600)
                )

                logger.info(
                    "Successfully added audio file {pk}: {name}".format(
                        pk=audio_file.pk,
                        name=item["case_names"].encode("utf-8"),
                    )
                )

        # Update the hash if everything finishes properly.
        logger.info(
            "%s: Successfully crawled oral arguments." % site.court_id
        )
        if not download_error and not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
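# The sha1() used in the scrape_court variants is a project helper, not
# hashlib.sha1 directly: its result is compared against the sha1 field on
# Audio and Opinion rows, so it presumably returns a hex digest string. A
# minimal sketch under that assumption:
import hashlib


def sha1(s):
    """Return the SHA1 hex digest of a bytestring."""
    return hashlib.sha1(s).hexdigest()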
def scrape_court(self, site, full_crawl=False):
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split('.')[-1].split('_')[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(court, full_crawl=full_crawl)
    abort = dup_checker.abort_by_url_hash(site.url, site.hash)
    if not abort:
        if site.cookies:
            logger.info("Using cookies: %s" % site.cookies)
        for i, item in enumerate(site):
            msg, r = get_binary_content(
                item['download_urls'],
                site.cookies,
                site._get_adapter_instance(),
                method=site.method,
            )
            if msg:
                logger.warning(msg)
                ErrorLog(log_level='WARNING', court=court,
                         message=msg).save()
                continue

            content = site.cleanup_content(r.content)

            current_date = item['case_dates']
            try:
                next_date = site[i + 1]['case_dates']
            except IndexError:
                next_date = None

            # request.content is sometimes a str, sometimes unicode, so
            # force it all to be bytes, pleasing hashlib.
            sha1_hash = sha1(force_bytes(content))
            if (court_str == 'nev' and
                    item['precedential_statuses'] == 'Unpublished'):
                # Nevada's non-precedential cases have different SHA1
                # sums every time.
                lookup_params = {
                    'lookup_value': item['download_urls'],
                    'lookup_by': 'download_url',
                }
            else:
                lookup_params = {
                    'lookup_value': sha1_hash,
                    'lookup_by': 'sha1',
                }

            onwards = dup_checker.press_on(Opinion, current_date,
                                           next_date, **lookup_params)
            if dup_checker.emulate_break:
                break

            if onwards:
                # Not a duplicate, carry on
                logger.info('Adding new document found at: %s' %
                            item['download_urls'].encode('utf-8'))
                dup_checker.reset()

                docket, opinion, cluster, citations, error = \
                    self.make_objects(item, court, sha1_hash, content)
                if error:
                    download_error = True
                    continue

                self.save_everything(
                    items={
                        'docket': docket,
                        'opinion': opinion,
                        'cluster': cluster,
                        'citations': citations,
                    },
                    index=False,
                )
                extract_doc_content.delay(
                    opinion.pk,
                    do_ocr=True,
                    citation_jitter=True,
                )

                logger.info("Successfully added doc {pk}: {name}".format(
                    pk=opinion.pk,
                    name=item['case_names'].encode('utf-8'),
                ))

        # Update the hash if everything finishes properly.
        logger.info("%s: Successfully crawled opinions." % site.court_id)
        if not download_error and not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
def scrape_court(self, site, full_crawl=False, ocr_available=True):
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split(".")[-1].split("_")[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(court, full_crawl=full_crawl)
    if dup_checker.abort_by_url_hash(site.url, site.hash):
        return

    if site.cookies:
        logger.info(f"Using cookies: {site.cookies}")
    for i, item in enumerate(site):
        msg, r = get_binary_content(
            item["download_urls"],
            site.cookies,
            method=site.method,
        )
        if msg:
            logger.warning(msg)
            ErrorLog(log_level="WARNING", court=court, message=msg).save()
            continue

        content = site.cleanup_content(r.content)

        current_date = item["case_dates"]
        try:
            next_date = site[i + 1]["case_dates"]
        except IndexError:
            next_date = None

        # request.content is sometimes a str, sometimes unicode, so
        # force it all to be bytes, pleasing hashlib.
        sha1_hash = sha1(force_bytes(content))
        if (
            court_str == "nev"
            and item["precedential_statuses"] == "Unpublished"
        ):
            # Nevada's non-precedential cases have different SHA1 sums
            # every time.
            lookup_params = {
                "lookup_value": item["download_urls"],
                "lookup_by": "download_url",
            }
        else:
            lookup_params = {
                "lookup_value": sha1_hash,
                "lookup_by": "sha1",
            }

        proceed = dup_checker.press_on(
            Opinion, current_date, next_date, **lookup_params
        )
        if dup_checker.emulate_break:
            break
        if not proceed:
            continue

        # Not a duplicate, carry on
        logger.info(
            f"Adding new document found at: {item['download_urls'].encode()}"
        )
        dup_checker.reset()

        docket, opinion, cluster, citations = make_objects(
            item, court, sha1_hash, content
        )

        save_everything(
            items={
                "docket": docket,
                "opinion": opinion,
                "cluster": cluster,
                "citations": citations,
            },
            index=False,
        )
        extract_doc_content.delay(
            opinion.pk, ocr_available=ocr_available, citation_jitter=True
        )

        logger.info(
            f"Successfully added doc {opinion.pk}: {item['case_names'].encode()}"
        )

    # Update the hash if everything finishes properly.
    logger.info(f"{site.court_id}: Successfully crawled opinions.")
    if not full_crawl:
        # Only update the hash if no errors occurred.
        dup_checker.update_site_hash(site.hash)
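# The DupChecker contract, as inferred from the scrape_court variants
# above (an interface sketch only; the real class is defined elsewhere in
# the project and may differ):
class DupChecker(object):
    # Set once enough consecutive duplicates suggest the rest of the site
    # is already in the database.
    emulate_break = False

    def __init__(self, court, full_crawl=False):
        self.court = court
        self.full_crawl = full_crawl

    def abort_by_url_hash(self, url, url_hash):
        """Return True if the site content is unchanged since the last crawl."""

    def press_on(self, model, current_date, next_date, lookup_value,
                 lookup_by):
        """Return True if the item looks new and should be ingested."""

    def reset(self):
        """Clear the consecutive-duplicate state after a new item."""

    def update_site_hash(self, url_hash):
        """Persist the site hash after a clean crawl."""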
def make_objects(self, item, court, sha1_hash, content): """Takes the meta data from the scraper and associates it with objects. Returns the created objects. """ blocked = item["blocked_statuses"] if blocked: date_blocked = date.today() else: date_blocked = None case_name_short = item.get( "case_name_shorts") or self.cnt.make_case_name_short( item["case_names"]) docket = Docket( docket_number=item.get("docket_numbers", ""), case_name=item["case_names"], case_name_short=case_name_short, court=court, blocked=blocked, date_blocked=date_blocked, source=Docket.SCRAPER, ) west_cite_str = item.get("west_citations", "") state_cite_str = item.get("west_state_citations", "") neutral_cite_str = item.get("neutral_citations", "") cluster = OpinionCluster( judges=item.get("judges", ""), date_filed=item["case_dates"], date_filed_is_approximate=item["date_filed_is_approximate"], case_name=item["case_names"], case_name_short=case_name_short, source="C", precedential_status=item["precedential_statuses"], nature_of_suit=item.get("nature_of_suit", ""), blocked=blocked, date_blocked=date_blocked, # These three fields are replaced below. federal_cite_one=west_cite_str, state_cite_one=state_cite_str, neutral_cite=neutral_cite_str, syllabus=item.get("summaries", ""), ) citations = [] cite_types = [ (west_cite_str, Citation.WEST), (state_cite_str, Citation.STATE), (neutral_cite_str, Citation.NEUTRAL), ] for cite_str, cite_type in cite_types: if cite_str: citations.append(make_citation(cite_str, cluster, cite_type)) opinion = Opinion( type=Opinion.COMBINED, sha1=sha1_hash, download_url=item["download_urls"], ) error = False try: cf = ContentFile(content) extension = get_extension(content) file_name = trunc(item["case_names"].lower(), 75) + extension opinion.file_with_date = cluster.date_filed opinion.local_path.save(file_name, cf, save=False) except: msg = ("Unable to save binary to disk. Deleted " "item: %s.\n %s" % (item["case_names"], traceback.format_exc())) logger.critical(msg.encode("utf-8")) ErrorLog(log_level="CRITICAL", court=court, message=msg).save() error = True return docket, opinion, cluster, citations, error
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)
    global die_now

    # this line is used for handling SIGTERM (CTRL+4), so things can die
    # safely
    signal.signal(signal.SIGTERM, signal_handler)

    module_strings = build_module_list(options["court_id"])
    if not len(module_strings):
        raise CommandError("Unable to import module or package. Aborting.")

    logger.info("Starting up the scraper.")
    num_courts = len(module_strings)
    wait = (options["rate"] * 60) / num_courts
    i = 0
    while i < num_courts:
        # this catches SIGTERM, so the code can be killed safely.
        if die_now:
            logger.info("The scraper has stopped.")
            sys.exit(1)

        package, module = module_strings[i].rsplit(".", 1)
        mod = __import__(
            "%s.%s" % (package, module), globals(), locals(), [module]
        )
        # noinspection PyBroadException
        try:
            self.parse_and_scrape_site(mod, options["full_crawl"])
        except Exception as e:
            # noinspection PyBroadException
            try:
                msg = ("********!! CRAWLER DOWN !!***********\n"
                       "*****scrape_court method failed!*****\n"
                       "********!! ACTION NEEDED !!**********\n%s" %
                       traceback.format_exc())
                logger.critical(msg)

                # opinions.united_states.federal.ca9_u --> ca9
                court_str = mod.Site.__module__.split(".")[-1].split("_")[0]
                court = Court.objects.get(pk=court_str)
                ErrorLog(log_level="CRITICAL", court=court,
                         message=msg).save()
            except Exception as e:
                # This is very important. Without this, an exception
                # above will crash the caller.
                pass
        finally:
            time.sleep(wait)
            last_court_in_list = i == (num_courts - 1)
            if last_court_in_list and options["daemon"]:
                # Start over...
                logger.info(
                    "All jurisdictions done. Looping back to "
                    "the beginning because daemon mode is enabled."
                )
                i = 0
            else:
                i += 1

    logger.info("The scraper has stopped.")
    sys.exit(0)
def make_objects(self, item, court, sha1_hash, content): """Takes the meta data from the scraper and associates it with objects. Returns the created objects. """ blocked = item['blocked_statuses'] if blocked: date_blocked = date.today() else: date_blocked = None case_name_short = (item.get('case_name_shorts') or self.cnt.make_case_name_short(item['case_names'])) docket = Docket( docket_number=item.get('docket_numbers', ''), case_name=item['case_names'], case_name_short=case_name_short, court=court, blocked=blocked, date_blocked=date_blocked, source=Docket.SCRAPER, ) west_cite_str = item.get('west_citations', '') state_cite_str = item.get('west_state_citations', '') neutral_cite_str = item.get('neutral_citations', '') cluster = OpinionCluster( judges=item.get('judges', ''), date_filed=item['case_dates'], date_filed_is_approximate=item['date_filed_is_approximate'], case_name=item['case_names'], case_name_short=case_name_short, source='C', precedential_status=item['precedential_statuses'], nature_of_suit=item.get('nature_of_suit', ''), blocked=blocked, date_blocked=date_blocked, # These three fields are replaced below. federal_cite_one=west_cite_str, state_cite_one=state_cite_str, neutral_cite=neutral_cite_str, syllabus=item.get('summaries', ''), ) citations = [] if west_cite_str: citation_obj = get_citations(west_cite_str)[0] citations.append( Citation( cluster=cluster, volume=citation_obj.volume, reporter=citation_obj.reporter, page=citation_obj.page, type=Citation.WEST, )) if state_cite_str: citation_obj = get_citations(state_cite_str)[0] citations.append( Citation( cluster=cluster, volume=citation_obj.volume, reporter=citation_obj.reporter, page=citation_obj.page, type=Citation.STATE, )) if neutral_cite_str: citation_obj = get_citations(neutral_cite_str)[0] citations.append( Citation( cluster=cluster, volume=citation_obj.volume, reporter=citation_obj.reporter, page=citation_obj.page, type=Citation.NEUTRAL, )) opinion = Opinion( type='010combined', sha1=sha1_hash, download_url=item['download_urls'], ) error = False try: cf = ContentFile(content) extension = get_extension(content) file_name = trunc(item['case_names'].lower(), 75) + extension opinion.file_with_date = cluster.date_filed opinion.local_path.save(file_name, cf, save=False) except: msg = ('Unable to save binary to disk. Deleted ' 'item: %s.\n %s' % (item['case_names'], traceback.format_exc())) logger.critical(msg.encode('utf-8')) ErrorLog(log_level='CRITICAL', court=court, message=msg).save() error = True return docket, opinion, cluster, citations, error
# From scraper.tasks.process_audio_file: convert, tag, and store the mp3.
try:
    _ = subprocess.check_output(avconv_command, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError, e:
    print 'avconv failed command: %s\nerror code: %s\noutput: %s\n' % \
        (avconv_command, e.returncode, e.output)
    print traceback.format_exc()
    raise

# Have to do this last because otherwise the mp3 hasn't yet been
# generated.
set_mp3_meta_data(af, path_to_tmp_location)

af.duration = eyed3.load(path_to_tmp_location).info.time_secs

# Read the generated mp3 back as bytes.
with open(path_to_tmp_location, 'rb') as mp3:
    try:
        cf = ContentFile(mp3.read())
        file_name = trunc(best_case_name(af).lower(), 72) + '_cl.mp3'
        af.file_with_date = af.docket.date_argued
        af.local_path_mp3.save(file_name, cf, save=False)
    except:
        msg = "Unable to save mp3 to audio_file in scraper.tasks." \
              "process_audio_file for item: %s\nTraceback:\n%s" % \
              (af.pk, traceback.format_exc())
        print msg
        ErrorLog(log_level='CRITICAL', court=af.docket.court,
                 message=msg).save()

af.processing_complete = True
af.save()
os.remove(path_to_tmp_location)
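# set_mp3_meta_data is defined elsewhere in the tasks module; a minimal
# sketch of what it plausibly does with eyed3 (the choice of tag fields
# here is an assumption, not the real implementation):
def set_mp3_meta_data(audio_obj, mp3_path):
    """Write basic ID3 tags onto the freshly generated mp3."""
    mp3 = eyed3.load(mp3_path)
    if mp3.tag is None:
        mp3.initTag()
    mp3.tag.title = best_case_name(audio_obj)
    mp3.tag.save()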