def setUp(self):
    super(ReportScrapeStatusTest, self).setUp()
    # Make some errors that we can tally
    ErrorLog(log_level='WARNING', court=self.court,
             message="test_msg").save()
    ErrorLog(log_level='CRITICAL', court=self.court,
             message="test_msg").save()
def process_audio_file(pk):
    """Given the key to an audio file, extract its content and add the
    related metadata to the database.
    """
    audio_file = Audio.objects.get(pk=pk)
    path_to_original = audio_file.local_path_original_file.path

    path_to_tmp_location = os.path.join('/tmp', str(time.time()) + '.mp3')

    # Convert original file to:
    #  - mono (-ac 1)
    #  - sample rate (audio samples / s) of 22050Hz (-ar 22050)
    #  - constant bit rate (sample resolution) of 48kbps (-ab 48k)
    avconv_command = ['avconv', '-i', path_to_original,
                      '-ac', '1',
                      '-ar', '22050',
                      '-ab', '48k',
                      path_to_tmp_location]
    _ = subprocess.check_output(avconv_command, stderr=subprocess.STDOUT)

    # Have to do this last because otherwise the mp3 hasn't yet been
    # generated.
    file_name = trunc(audio_file.case_name.lower(), 72) + '_cl.mp3'
    set_mp3_meta_data(audio_file, path_to_tmp_location)
    audio_file.duration = eyed3.load(path_to_tmp_location).info.time_secs

    with open(path_to_tmp_location, 'rb') as mp3:
        try:
            cf = ContentFile(mp3.read())
            audio_file.local_path_mp3.save(file_name, cf, save=False)
        except Exception:
            msg = "Unable to save mp3 to audio_file in scraper.tasks." \
                  "process_audio_file for item: %s\nTraceback:\n%s" % \
                  (audio_file.pk, traceback.format_exc())
            logger.critical(msg)
            ErrorLog(log_level='CRITICAL', court=audio_file.docket.court,
                     message=msg).save()

    audio_file.processing_complete = True
    audio_file.save()
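# For reference, the avconv invocation built above is equivalent to
# running the following from a shell (paths illustrative):
#
#   avconv -i /path/to/original.wma -ac 1 -ar 22050 -ab 48k /tmp/out.mp3
#
# Since this function is dispatched as a task by the oral argument
# scraper (see scrape_court below), a hand-queued run looks like the
# sketch here; the pk is hypothetical and the countdown mirrors the
# scraper's random delay of up to an hour:
process_audio_file.apply_async((42,), countdown=3600)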
def handle(self, *args, **options):
    global die_now

    # this line is used for handling SIGTERM (CTRL+4), so things can die
    # safely
    signal.signal(signal.SIGTERM, signal_handler)

    self.verbosity = int(options.get('verbosity', 1))
    daemon_mode = options.get('daemonmode', False)
    full_crawl = options.get('full_crawl', False)

    try:
        rate = int(options['rate'])
    except (ValueError, AttributeError, TypeError):
        rate = 30

    court_id = options.get('court_id')
    if not court_id:
        raise CommandError(
            'You must specify a court as a package or module.')
    else:
        module_strings = build_module_list(court_id)
        if not len(module_strings):
            raise CommandError(
                'Unable to import module or package. Aborting.')

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        wait = (rate * 60) / num_courts
        i = 0
        while i < num_courts:
            # this catches SIGTERM, so the code can be killed safely.
            if die_now:
                logger.info("The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)
            mod = __import__("%s.%s" % (package, module),
                             globals(), locals(), [module])
            # noinspection PyBroadException
            try:
                self.parse_and_scrape_site(mod, full_crawl)
            except Exception:
                # noinspection PyBroadException
                try:
                    msg = ('********!! CRAWLER DOWN !!***********\n'
                           '*****scrape_court method failed!*****\n'
                           '********!! ACTION NEEDED !!**********\n%s'
                           ) % traceback.format_exc()
                    logger.critical(msg)

                    # opinions.united_states.federal.ca9_u --> ca9
                    court_str = mod.Site.__module__.split('.')[-1]\
                        .split('_')[0]
                    court = Court.objects.get(pk=court_str)
                    ErrorLog(log_level='CRITICAL', court=court,
                             message=msg).save()
                except Exception:
                    # This is very important. Without this, an exception
                    # above will crash the caller.
                    pass
            finally:
                # Assumed loop-advance logic (the body of this block is a
                # minimal reconstruction from the surrounding variables):
                # sleep between courts and move to the next one, wrapping
                # around in daemon mode so the crawl runs indefinitely.
                if daemon_mode and i == num_courts - 1:
                    i = 0
                else:
                    i += 1
                time.sleep(wait)
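# Usage sketch (shell): assuming this Command is installed under a
# management/commands/ module named, say, cl_scrape_opinions (the name is
# hypothetical; it comes from the file name, not this class), a crawl of
# the federal appellate courts in daemon mode would look like:
#
#   python manage.py cl_scrape_opinions --court_id opinions.united_states.federal --daemonmode
#
# Sending SIGTERM (e.g. `kill <pid>`) flips die_now via signal_handler,
# so the loop exits cleanly at the top of its next iteration.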
def scrape_court(self, site, full_crawl=False):
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split('.')[-1].split('_')[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(court, full_crawl=full_crawl)
    abort = dup_checker.abort_by_url_hash(site.url, site.hash)
    if not abort:
        if site.cookies:
            logger.info("Using cookies: %s" % site.cookies)
        for i in range(0, len(site.case_names)):
            msg, r = get_binary_content(site.download_urls[i],
                                        site.cookies,
                                        method=site.method)
            if msg:
                logger.warn(msg)
                ErrorLog(log_level='WARNING', court=court,
                         message=msg).save()
                continue

            current_date = site.case_dates[i]
            try:
                next_date = site.case_dates[i + 1]
            except IndexError:
                next_date = None

            # Make a hash of the data
            sha1_hash = hashlib.sha1(r.content).hexdigest()
            if (court_str == 'nev' and
                    site.precedential_statuses[i] == 'Unpublished'):
                # Nevada's non-precedential cases have different SHA1
                # sums every time, so look them up by URL instead.
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    Document,
                    current_date,
                    next_date,
                    lookup_value=site.download_urls[i],
                    lookup_by='download_url'
                )
            else:
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    Document,
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1'
                )

            if onwards == 'CONTINUE':
                # It's a duplicate, but we haven't hit any thresholds yet.
                continue
            elif onwards == 'BREAK':
                # It's a duplicate, and we hit a date or dup_count
                # threshold.
                dup_checker.update_site_hash(sha1_hash)
                break
            elif onwards == 'CARRY_ON':
                # Not a duplicate, carry on
                logger.info('Adding new document found at: %s' %
                            site.download_urls[i])
                dup_checker.reset()

                cite, docket, doc = self.associate_meta_data_to_objects(
                    site, i, court, sha1_hash)

                # Make and associate the file object
                try:
                    cf = ContentFile(r.content)
                    extension = get_extension(r.content)
                    # See bitbucket issue #215 for why this must be
                    # lower-cased.
                    file_name = trunc(site.case_names[i].lower(), 75) + \
                        extension
                    doc.local_path.save(file_name, cf, save=False)
                except Exception:
                    msg = ('Unable to save binary to disk. Deleted '
                           'document: %s.\n%s' %
                           (site.case_names[i], traceback.format_exc()))
                    logger.critical(msg)
                    ErrorLog(log_level='CRITICAL', court=court,
                             message=msg).save()
                    download_error = True
                    continue

                # Save everything, but don't update Solr index yet
                self.save_everything(cite, docket, doc, index=False)
                random_delay = random.randint(0, 3600)
                extract_doc_content.delay(
                    doc.pk,
                    callback=subtask(extract_by_ocr),
                    citation_countdown=random_delay
                )

                logger.info("Successfully added doc {pk}: {name}".format(
                    pk=doc.pk, name=site.case_names[i]))

        # Update the hash if everything finishes properly.
        logger.info("%s: Successfully crawled opinions." % site.court_id)
        if not download_error and not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
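# Why Nevada gets special treatment above: if a court re-serves the
# "same" opinion with bytes that differ on every download (a stamped
# timestamp, say), its SHA1 changes each time and hash-based dedup never
# matches. A standalone illustration with hashlib (document bytes are
# made up):
import hashlib

doc_v1 = 'Opinion text ... served 2014-01-01 10:00:00'
doc_v2 = 'Opinion text ... served 2014-01-01 10:00:05'
assert hashlib.sha1(doc_v1).hexdigest() != hashlib.sha1(doc_v2).hexdigest()
# Hence lookup_by='download_url' for Nevada's unpublished cases, where
# the URL is stable even though the content hash is not.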
        '48k',
        path_to_tmp_location,
    ]
    try:
        _ = subprocess.check_output(avconv_command,
                                    stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError, e:
        print 'avconv failed command: %s\nerror code: %s\noutput: %s\n' % \
            (avconv_command, e.returncode, e.output)
        print traceback.format_exc()
        raise

    # Have to do this last because otherwise the mp3 hasn't yet been
    # generated.
    set_mp3_meta_data(af, path_to_tmp_location)
    af.duration = eyed3.load(path_to_tmp_location).info.time_secs

    with open(path_to_tmp_location, 'rb') as mp3:
        try:
            cf = ContentFile(mp3.read())
            file_name = trunc(af.case_name.lower(), 72) + '_cl.mp3'
            af.local_path_mp3.save(file_name, cf, save=False)
        except Exception:
            msg = "Unable to save mp3 to audio_file in scraper.tasks." \
                  "process_audio_file for item: %s\nTraceback:\n%s" % \
                  (af.pk, traceback.format_exc())
            ErrorLog(log_level='CRITICAL', court=af.docket.court,
                     message=msg).save()

    af.processing_complete = True
    af.save()
    os.remove(path_to_tmp_location)
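# A standalone sketch of the duration read above (the path is
# illustrative). eyed3.load() returns an AudioFile, or None when the file
# isn't recognized, and .info.time_secs holds the track length in seconds:
import eyed3

audio = eyed3.load('/tmp/example.mp3')
if audio is not None:
    print 'Duration: %s seconds' % audio.info.time_secs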
def scrape_court(self, site, full_crawl=False):
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split('.')[-1].split('_')[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(court, full_crawl=full_crawl)
    abort = dup_checker.abort_by_url_hash(site.url, site.hash)
    if not abort:
        if site.cookies:
            logger.info("Using cookies: %s" % site.cookies)
        for i in range(0, len(site.case_names)):
            msg, r = get_binary_content(
                site.download_urls[i],
                site.cookies,
                site._get_adapter_instance(),
                method=site.method
            )
            if msg:
                logger.warn(msg)
                ErrorLog(log_level='WARNING', court=court,
                         message=msg).save()
                continue

            content = site.cleanup_content(r.content)

            current_date = site.case_dates[i]
            try:
                next_date = site.case_dates[i + 1]
            except IndexError:
                next_date = None

            sha1_hash = hashlib.sha1(content).hexdigest()
            onwards = dup_checker.should_we_continue_break_or_carry_on(
                Audio,
                current_date,
                next_date,
                lookup_value=sha1_hash,
                lookup_by='sha1'
            )

            if onwards == 'CONTINUE':
                # It's a duplicate, but we haven't hit any thresholds yet.
                continue
            elif onwards == 'BREAK':
                # It's a duplicate, and we hit a date or dup_count
                # threshold.
                dup_checker.update_site_hash(sha1_hash)
                break
            elif onwards == 'CARRY_ON':
                # Not a duplicate, carry on
                logger.info('Adding new document found at: %s' %
                            site.download_urls[i])
                dup_checker.reset()

                docket, audio_file = self.associate_meta_data_to_objects(
                    site, i, court, sha1_hash)

                # Make and associate the file object
                try:
                    cf = ContentFile(content)
                    extension = get_extension(content)
                    if extension not in ['.mp3', '.wma']:
                        extension = '.' + \
                            site.download_urls[i].rsplit('.', 1)[1]
                    # See bitbucket issue #215 for why this must be
                    # lower-cased.
                    file_name = trunc(site.case_names[i].lower(), 75) + \
                        extension
                    audio_file.local_path_original_file.save(
                        file_name, cf, save=False)
                except Exception:
                    msg = ('Unable to save binary to disk. Deleted '
                           'document: %s.\n%s' %
                           (site.case_names[i], traceback.format_exc()))
                    logger.critical(msg)
                    ErrorLog(log_level='CRITICAL', court=court,
                             message=msg).save()
                    download_error = True
                    continue

                self.save_everything(docket, audio_file)
                random_delay = random.randint(0, 3600)
                process_audio_file.apply_async(
                    (audio_file.pk,),
                    countdown=random_delay
                )

                logger.info("Successfully added audio file %s: %s" %
                            (audio_file.pk, site.case_names[i]))

        # Update the hash if everything finishes properly.
        logger.info("%s: Successfully crawled oral arguments." %
                    site.court_id)
        if not download_error and not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
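# The extension fallback above distrusts get_extension() for anything
# that isn't a known audio type and uses the URL's suffix instead. A
# standalone sketch of that decision (the function name is illustrative):
def pick_extension(detected, download_url):
    """Prefer a recognized audio extension; else fall back to the URL's."""
    if detected in ['.mp3', '.wma']:
        return detected
    return '.' + download_url.rsplit('.', 1)[1]

# e.g. pick_extension('.bin', 'http://example.com/argument.wma') --> '.wma'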
def setUp(self):
    court = Court.objects.get(pk='test')
    # Make some errors that we can tally
    ErrorLog(log_level='WARNING', court=court,
             message="test_msg").save()
    ErrorLog(log_level='CRITICAL', court=court,
             message="test_msg").save()
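# A sketch of the tally these fixtures support -- counting logged errors
# by level with the Django ORM (the filter values mirror the rows created
# in setUp above):
warnings = ErrorLog.objects.filter(log_level='WARNING').count()
criticals = ErrorLog.objects.filter(log_level='CRITICAL').count()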