Example #1
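A test setUp that seeds one WARNING and one CRITICAL ErrorLog row for the test court, so the scrape-status report has errors to tally.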
    def setUp(self):
        super(ReportScrapeStatusTest, self).setUp()

        # Make some errors that we can tally
        ErrorLog(log_level='WARNING', court=self.court,
                 message="test_msg").save()
        ErrorLog(log_level='CRITICAL', court=self.court,
                 message="test_msg").save()
Example #2
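A task that re-encodes an uploaded audio file to a normalized MP3 with avconv, tags it, stores the result and its duration on the Audio record, and saves a CRITICAL ErrorLog if writing the MP3 fails.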
def process_audio_file(pk):
    """Given the key to an audio file, extract its content and add the related
    meta data to the database.
    """
    audio_file = Audio.objects.get(pk=pk)
    path_to_original = audio_file.local_path_original_file.path

    path_to_tmp_location = os.path.join('/tmp', str(time.time()) + '.mp3')

    # Convert original file to:
    #  - mono (-ac 1)
    #  - sample rate (audio samples / s) of 22050Hz (-ar 22050)
    #  - constant bit rate (sample resolution) of 48kbps (-ab 48k)
    avconv_command = [
        'avconv', '-i', path_to_original, '-ac', '1', '-ar', '22050', '-ab',
        '48k', path_to_tmp_location
    ]
    _ = subprocess.check_output(avconv_command, stderr=subprocess.STDOUT)

    # Have to do this last because otherwise the mp3 hasn't yet been generated.
    file_name = trunc(audio_file.case_name.lower(), 72) + '_cl.mp3'
    set_mp3_meta_data(audio_file, path_to_tmp_location)

    audio_file.duration = eyed3.load(path_to_tmp_location).info.time_secs

    with open(path_to_tmp_location, 'rb') as mp3:
        try:
            cf = ContentFile(mp3.read())
            audio_file.local_path_mp3.save(file_name, cf, save=False)
        except:
            msg = "Unable to save mp3 to audio_file in scraper.tasks.process_" \
                  "audio_file for item: %s\nTraceback:\n%s" % \
                  (audio_file.pk, traceback.format_exc())
            logger.critical(msg)
            ErrorLog(log_level='CRITICAL',
                     court=audio_file.docket.court,
                     message=msg).save()

    audio_file.processing_complete = True
    audio_file.save()
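Example #3
The handle() method of a scraper management command: it walks the configured court modules and, when a scrape blows up, logs the failure and saves it as a CRITICAL ErrorLog against the offending court.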
    def handle(self, *args, **options):
        global die_now

        # this line is used for handling SIGTERM (CTRL+4), so things can die
        # safely
        signal.signal(signal.SIGTERM, signal_handler)

        self.verbosity = int(options.get('verbosity', 1))
        daemon_mode = options.get('daemonmode', False)

        full_crawl = options.get('full_crawl', False)

        try:
            rate = int(options['rate'])
        except (ValueError, AttributeError, TypeError):
            rate = 30

        court_id = options.get('court_id')
        if not court_id:
            raise CommandError(
                'You must specify a court as a package or module.')
        else:
            module_strings = build_module_list(court_id)
            if not len(module_strings):
                raise CommandError(
                    'Unable to import module or package. Aborting.')

            logger.info("Starting up the scraper.")
            num_courts = len(module_strings)
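            # Divide the rate window evenly across the courts to get the per-court wait.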
            wait = (rate * 60) / num_courts
            i = 0
            while i < num_courts:
                # this catches SIGTERM, so the code can be killed safely.
                if die_now:
                    logger.info("The scraper has stopped.")
                    sys.exit(1)

                package, module = module_strings[i].rsplit('.', 1)

                mod = __import__("%s.%s" % (package, module), globals(),
                                 locals(), [module])
                # noinspection PyBroadException
                try:
                    self.parse_and_scrape_site(mod, full_crawl)
                except Exception, e:
                    # noinspection PyBroadException
                    try:
                        msg = ('********!! CRAWLER DOWN !!***********\n'
                               '*****scrape_court method failed!*****\n'
                               '********!! ACTION NEEDED !!**********\n%s'
                               ) % traceback.format_exc()
                        logger.critical(msg)

                        # opinions.united_states.federal.ca9_u --> ca9
                        court_str = mod.Site.__module__.split('.')[-1].split(
                            '_')[0]
                        court = Court.objects.get(pk=court_str)
                        ErrorLog(log_level='CRITICAL',
                                 court=court,
                                 message=msg).save()
                    except Exception, e:
                        # This is very important. Without this, an exception above will crash the caller.
                        pass
                finally:
                    pass  # (rest of the loop body omitted in this excerpt)
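Example #4
scrape_court() for opinions: each download is duplicate-checked, download problems are saved as WARNING ErrorLog rows, and failures to write the binary to disk are saved as CRITICAL rows.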
    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
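        # Skip the whole site if its hash shows nothing has changed since the last crawl.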
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i in range(0, len(site.case_names)):
                msg, r = get_binary_content(site.download_urls[i],
                                            site.cookies,
                                            method=site.method)
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level='WARNING', court=court,
                             message=msg).save()
                    continue

                current_date = site.case_dates[i]
                try:
                    next_date = site.case_dates[i + 1]
                except IndexError:
                    next_date = None

                # Make a hash of the data
                sha1_hash = hashlib.sha1(r.content).hexdigest()
                if (court_str == 'nev' and
                        site.precedential_statuses[i] == 'Unpublished'):
                    # Nevada's non-precedential cases have different SHA1 sums every time.
                    onwards = dup_checker.should_we_continue_break_or_carry_on(
                        Document,
                        current_date,
                        next_date,
                        lookup_value=site.download_urls[i],
                        lookup_by='download_url')
                else:
                    onwards = dup_checker.should_we_continue_break_or_carry_on(
                        Document,
                        current_date,
                        next_date,
                        lookup_value=sha1_hash,
                        lookup_by='sha1')

                if onwards == 'CONTINUE':
                    # It's a duplicate, but we haven't hit any thresholds yet.
                    continue
                elif onwards == 'BREAK':
                    # It's a duplicate, and we hit a date or dup_count threshold.
                    dup_checker.update_site_hash(sha1_hash)
                    break
                elif onwards == 'CARRY_ON':
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' %
                                site.download_urls[i])
                    dup_checker.reset()

                    cite, docket, doc = self.associate_meta_data_to_objects(
                        site, i, court, sha1_hash)

                    # Make and associate the file object
                    try:
                        cf = ContentFile(r.content)
                        extension = get_extension(r.content)
                        # See bitbucket issue #215 for why this must be
                        # lower-cased.
                        file_name = trunc(site.case_names[i].lower(),
                                          75) + extension
                        doc.local_path.save(file_name, cf, save=False)
                    except:
                        msg = 'Unable to save binary to disk. Deleted document: %s.\n%s' % \
                              (site.case_names[i], traceback.format_exc())
                        logger.critical(msg)
                        ErrorLog(log_level='CRITICAL',
                                 court=court,
                                 message=msg).save()
                        download_error = True
                        continue

                    # Save everything, but don't update Solr index yet
                    self.save_everything(cite, docket, doc, index=False)
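                    # Extract the document's text now; the random countdown staggers the follow-up citation work over the next hour.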
                    random_delay = random.randint(0, 3600)
                    extract_doc_content.delay(doc.pk,
                                              callback=subtask(extract_by_ocr),
                                              citation_countdown=random_delay)

                    logger.info("Successfully added doc {pk}: {name}".format(
                        pk=doc.pk, name=site.case_names[i]))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled opinions." % site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)
Example #5
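A later variant of process_audio_file: avconv failures are printed and re-raised, the MP3-save failure is still recorded as a CRITICAL ErrorLog, and the temporary file is removed at the end. (The excerpt starts mid-way through the avconv command list.)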
        '48k', path_to_tmp_location
    ]
    try:
        _ = subprocess.check_output(avconv_command, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError, e:
        print 'avconv failed command: %s\nerror code: %s\noutput: %s\n' % \
              (avconv_command, e.returncode, e.output)
        print traceback.format_exc()
        raise

    # Have to do this last because otherwise the mp3 hasn't yet been generated.
    set_mp3_meta_data(af, path_to_tmp_location)

    af.duration = eyed3.load(path_to_tmp_location).info.time_secs

    with open(path_to_tmp_location, 'rb') as mp3:
        try:
            cf = ContentFile(mp3.read())
            file_name = trunc(af.case_name.lower(), 72) + '_cl.mp3'
            af.local_path_mp3.save(file_name, cf, save=False)
        except:
            msg = "Unable to save mp3 to audio_file in scraper.tasks.process_" \
                  "audio_file for item: %s\nTraceback:\n%s" % \
                  (af.pk, traceback.format_exc())
            ErrorLog(log_level='CRITICAL', court=af.docket.court,
                     message=msg).save()

    af.processing_complete = True
    af.save()
    os.remove(path_to_tmp_location)
Example #6
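scrape_court() for oral arguments: the same ErrorLog pattern as the opinion scraper, but the response content is cleaned up before hashing, saved onto Audio objects, and handed to process_audio_file after a random delay.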
    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i in range(0, len(site.case_names)):
                msg, r = get_binary_content(site.download_urls[i],
                                            site.cookies,
                                            site._get_adapter_instance(),
                                            method=site.method)
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level='WARNING', court=court,
                             message=msg).save()
                    continue
                content = site.cleanup_content(r.content)

                current_date = site.case_dates[i]
                try:
                    next_date = site.case_dates[i + 1]
                except IndexError:
                    next_date = None

                # Hash the cleaned-up content so duplicates can be spotted across crawls.
                sha1_hash = hashlib.sha1(content).hexdigest()
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    Audio,
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1')

                if onwards == 'CONTINUE':
                    # It's a duplicate, but we haven't hit any thresholds yet.
                    continue
                elif onwards == 'BREAK':
                    # It's a duplicate, and we hit a date or dup_count threshold.
                    dup_checker.update_site_hash(sha1_hash)
                    break
                elif onwards == 'CARRY_ON':
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' %
                                site.download_urls[i])
                    dup_checker.reset()

                    docket, audio_file = self.associate_meta_data_to_objects(
                        site, i, court, sha1_hash)

                    # Make and associate the file object
                    try:
                        cf = ContentFile(content)
                        extension = get_extension(content)
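                        # If the detected extension isn't an audio type we expect, fall back to the one in the download URL.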
                        if extension not in ['.mp3', '.wma']:
                            extension = '.' + site.download_urls[i].rsplit(
                                '.', 1)[1]
                        # See bitbucket issue #215 for why this must be
                        # lower-cased.
                        file_name = trunc(site.case_names[i].lower(),
                                          75) + extension
                        audio_file.local_path_original_file.save(file_name,
                                                                 cf,
                                                                 save=False)
                    except:
                        msg = 'Unable to save binary to disk. Deleted document: %s.\n%s' % \
                              (site.case_names[i], traceback.format_exc())
                        logger.critical(msg)
                        ErrorLog(log_level='CRITICAL',
                                 court=court,
                                 message=msg).save()
                        download_error = True
                        continue

                    self.save_everything(docket, audio_file)
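                    # Queue audio post-processing on a worker, delayed by up to an hour.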
                    random_delay = random.randint(0, 3600)
                    process_audio_file.apply_async((audio_file.pk, ),
                                                   countdown=random_delay)

                    logger.info("Successfully added audio file %s: %s" %
                                (audio_file.pk, site.case_names[i]))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled oral arguments." %
                        site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)
Example #7
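A bare-bones test setUp that creates one WARNING and one CRITICAL ErrorLog against the 'test' court.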
    def setUp(self):
        court = Court.objects.get(pk='test')

        # Make some errors that we can tally
        ErrorLog(log_level='WARNING', court=court, message="test_msg").save()
        ErrorLog(log_level='CRITICAL', court=court, message="test_msg").save()
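Every example above builds ErrorLog rows from the same three fields. As a rough sketch only, the model these call sites imply might look something like the following; the field types, lengths, and the Court relation are assumptions, not the project's actual definition.

# Hypothetical sketch of the ErrorLog model implied by the call sites above.
# Field types and lengths are assumptions, not the project's real definition.
from django.db import models


class ErrorLog(models.Model):
    log_level = models.CharField(max_length=15)   # e.g. 'WARNING' or 'CRITICAL'
    court = models.ForeignKey('Court')            # the court the error is tied to
    message = models.TextField(blank=True)        # message text, often a traceback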