Example #1
    def handle(self, *args, **options):
        global die_now

        # this line is used for handling SIGTERM (CTRL+4), so things can die
        # safely
        signal.signal(signal.SIGTERM, signal_handler)

        self.verbosity = int(options.get("verbosity", 1))
        daemon_mode = options.get("daemonmode", False)

        full_crawl = options.get("full_crawl", False)

        try:
            rate = int(options["rate"])
        except (ValueError, AttributeError, TypeError):
            rate = 30

        court_id = options.get("court_id")
        if not court_id:
            raise CommandError("You must specify a court as a package or " "module.")
        else:
            module_strings = build_module_list(court_id)
            if not len(module_strings):
                raise CommandError("Unable to import module or package. " "Aborting.")

            logger.info("Starting up the scraper.")
            num_courts = len(module_strings)
            wait = (rate * 60) / num_courts
            i = 0
            while i < num_courts:
                # this catches SIGTERM, so the code can be killed safely.
                if die_now:
                    logger.info("The scraper has stopped.")
                    sys.exit(1)

                package, module = module_strings[i].rsplit(".", 1)

                mod = __import__("%s.%s" % (package, module), globals(), locals(), [module])
                # noinspection PyBroadException
                try:
                    self.parse_and_scrape_site(mod, full_crawl)
                except Exception as e:
                    # noinspection PyBroadException
                    try:
                        msg = (
                            "********!! CRAWLER DOWN !!***********\n"
                            "*****scrape_court method failed!*****\n"
                            "********!! ACTION NEEDED !!**********\n%s" % traceback.format_exc()
                        )
                        logger.critical(msg)

                        # opinions.united_states.federal.ca9_u --> ca9
                        court_str = mod.Site.__module__.split(".")[-1].split("_")[0]
                        court = Court.objects.get(pk=court_str)
                        ErrorLog(log_level="CRITICAL", court=court, message=msg).save()
                    except Exception as e:
                        # This is very important. Without this, an exception
                        # above will crash the caller.
                        pass
                finally:
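The loop above assumes a module-level die_now flag and a signal_handler function that are not shown in this example. A minimal sketch of those pieces, under that assumption (the real implementation may differ):

import signal

# Hypothetical module-level flag; the while-loop in handle() checks it on each
# pass and exits cleanly instead of being killed mid-scrape.
die_now = False


def signal_handler(signum, frame):
    """Ask the scraper to stop after the current court."""
    global die_now
    die_now = True


# The same registration that handle() performs above:
signal.signal(signal.SIGTERM, signal_handler)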
Example #2
    def make_objects(self, item, court, sha1_hash, content):
        """Takes the meta data from the scraper and associates it with objects.

        Returns the created objects.
        """
        blocked = item['blocked_statuses']
        if blocked is not None:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))
        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            source=Docket.SCRAPER,
        )

        cluster = OpinionCluster(
            judges=item.get('judges', ''),
            date_filed=item['case_dates'],
            case_name=item['case_names'],
            case_name_short=case_name_short,
            source='C',
            precedential_status=item['precedential_statuses'],
            nature_of_suit=item.get('nature_of_suit', ''),
            blocked=blocked,
            date_blocked=date_blocked,
            federal_cite_one=item.get('west_citations', ''),
            state_cite_one=item.get('west_state_citations', ''),
            neutral_cite=item.get('neutral_citations', ''),
        )
        opinion = Opinion(
            type='010combined',
            sha1=sha1_hash,
            download_url=item['download_urls'],
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            file_name = trunc(item['case_names'].lower(), 75) + extension
            opinion.file_with_date = cluster.date_filed
            opinion.local_path.save(file_name, cf, save=False)
        except:
            msg = ('Unable to save binary to disk. Deleted '
                   'item: %s.\n %s' %
                   (item['case_names'], traceback.format_exc()))
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, opinion, cluster, error
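The docstring notes that make_objects only builds the objects and returns them. A hedged sketch of how a caller might wire the returned objects together and persist them; the scraper name and the foreign-key assignments are assumptions based on the fields used above, not code from this example:

# Illustrative caller; `scraper` stands in for the command instance, and the
# relation assignments are assumed from the model fields above.
docket, opinion, cluster, error = scraper.make_objects(
    item, court, sha1_hash, content)
if not error:
    docket.save()
    cluster.docket = docket
    cluster.save()
    opinion.cluster = cluster
    opinion.save()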
Example #4
    def handle(self, *args, **options):
        global die_now

        # this line is used for handling SIGTERM (CTRL+4), so things can die
        # safely
        signal.signal(signal.SIGTERM, signal_handler)

        module_strings = build_module_list(options['court_id'])
        if not len(module_strings):
            raise CommandError('Unable to import module or package. Aborting.')

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        wait = (options['rate'] * 60) / num_courts
        i = 0
        while i < num_courts:
            # this catches SIGTERM, so the code can be killed safely.
            if die_now:
                logger.info("The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)

            mod = __import__(
                "%s.%s" % (package, module),
                globals(),
                locals(),
                [module]
            )
            # noinspection PyBroadException
            try:
                self.parse_and_scrape_site(mod, options['full_crawl'])
            except Exception as e:
                # noinspection PyBroadException
                try:
                    msg = ('********!! CRAWLER DOWN !!***********\n'
                           '*****scrape_court method failed!*****\n'
                           '********!! ACTION NEEDED !!**********\n%s' %
                           traceback.format_exc())
                    logger.critical(msg)

                    # opinions.united_states.federal.ca9_u --> ca9
                    court_str = mod.Site.__module__.split('.')[-1].split('_')[0]
                    court = Court.objects.get(pk=court_str)
                    ErrorLog(
                        log_level='CRITICAL',
                        court=court,
                        message=msg
                    ).save()
                except Exception as e:
                    # This is very important. Without this, an exception
                    # above will crash the caller.
                    pass
Example #5
    def make_objects(self, item, court, sha1_hash, content):
        blocked = item['blocked_statuses']
        if blocked:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))

        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            date_argued=item['case_dates'],
            source=Docket.SCRAPER,
        )

        audio_file = Audio(
            judges=item.get('judges', ''),
            source='C',
            case_name=item['case_names'],
            case_name_short=case_name_short,
            sha1=sha1_hash,
            download_url=item['download_urls'],
            blocked=blocked,
            date_blocked=date_blocked,
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            if extension not in ['.mp3', '.wma']:
                extension = '.' + item['download_urls'].lower().rsplit('.',
                                                                       1)[1]
            # See bitbucket issue #215 for why this must be
            # lower-cased.
            file_name = trunc(item['case_names'].lower(), 75) + extension
            audio_file.file_with_date = docket.date_argued
            audio_file.local_path_original_file.save(file_name, cf, save=False)
        except:
            msg = 'Unable to save binary to disk. Deleted audio file: %s.\n ' \
                  '%s' % (item['case_names'], traceback.format_exc())
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, audio_file, error
Example #6
    def make_objects(self, item, court, sha1_hash, content):
        blocked = item['blocked_statuses']
        if blocked:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))

        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            date_argued=item['case_dates'],
            source=Docket.SCRAPER,
        )

        audio_file = Audio(
            judges=item.get('judges', ''),
            source='C',
            case_name=item['case_names'],
            case_name_short=case_name_short,
            sha1=sha1_hash,
            download_url=item['download_urls'],
            blocked=blocked,
            date_blocked=date_blocked,
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            if extension not in ['.mp3', '.wma']:
                extension = '.' + item['download_urls'].lower().rsplit('.', 1)[1]
            # See bitbucket issue #215 for why this must be
            # lower-cased.
            file_name = trunc(item['case_names'].lower(), 75) + extension
            audio_file.file_with_date = docket.date_argued
            audio_file.local_path_original_file.save(file_name, cf, save=False)
        except:
            msg = 'Unable to save binary to disk. Deleted audio file: %s.\n ' \
                  '%s' % (item['case_names'], traceback.format_exc())
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, audio_file, error
Example #7
    def handle(self, *args, **options):
        global die_now

        # this line is used for handling SIGTERM (CTRL+4), so things can die
        # safely
        signal.signal(signal.SIGTERM, signal_handler)

        module_strings = build_module_list(options['court_id'])
        if not len(module_strings):
            raise CommandError('Unable to import module or package. Aborting.')

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        wait = (options['rate'] * 60) / num_courts
        i = 0
        while i < num_courts:
            # this catches SIGTERM, so the code can be killed safely.
            if die_now:
                logger.info("The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)

            mod = __import__("%s.%s" % (package, module), globals(), locals(),
                             [module])
            # noinspection PyBroadException
            try:
                self.parse_and_scrape_site(mod, options['full_crawl'])
            except Exception as e:
                # noinspection PyBroadException
                try:
                    msg = ('********!! CRAWLER DOWN !!***********\n'
                           '*****scrape_court method failed!*****\n'
                           '********!! ACTION NEEDED !!**********\n%s' %
                           traceback.format_exc())
                    logger.critical(msg)

                    # opinions.united_states.federal.ca9_u --> ca9
                    court_str = mod.Site.__module__.split('.')[-1].split(
                        '_')[0]
                    court = Court.objects.get(pk=court_str)
                    ErrorLog(log_level='CRITICAL', court=court,
                             message=msg).save()
                except Exception as e:
                    # This is very important. Without this, an exception
                    # above will crash the caller.
                    pass
Example #8
def process_audio_file(pk):
    """Given the key to an audio file, extract its content and add the related
    meta data to the database.
    """
    audio_file = Audio.objects.get(pk=pk)
    path_to_original = audio_file.local_path_original_file.path

    path_to_tmp_location = os.path.join('/tmp', str(time.time()) + '.mp3')

    # Convert original file to:
    #  - mono (-ac 1)
    #  - sample rate (audio samples / s) of 22050Hz (-ar 22050)
    #  - constant bit rate (sample resolution) of 48kbps (-ab 48k)
    avconv_command = ['avconv', '-i', path_to_original,
                      '-ac', '1',
                      '-ar', '22050',
                      '-ab', '48k',
                      path_to_tmp_location]
    _ = subprocess.check_output(avconv_command, stderr=subprocess.STDOUT)

    # Have to do this last because otherwise the mp3 hasn't yet been generated.
    file_name = trunc(audio_file.case_name.lower(), 72) + '_cl.mp3'
    set_mp3_meta_data(audio_file, path_to_tmp_location)

    audio_file.duration = eyed3.load(path_to_tmp_location).info.time_secs

    with open(path_to_tmp_location, 'rb') as mp3:
        try:
            cf = ContentFile(mp3.read())
            audio_file.local_path_mp3.save(file_name, cf, save=False)
        except:
            msg = "Unable to save mp3 to audio_file in scraper.tasks.process_" \
                  "audio_file for item: %s\nTraceback:\n%s" % \
                  (audio_file.pk, traceback.format_exc())
            logger.critical(msg)
            ErrorLog(log_level='CRITICAL', court=audio_file.docket.court,
                     message=msg).save()

    audio_file.processing_complete = True
    audio_file.save()
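In the scrape_court examples later on this page (Examples #10 and #14), process_audio_file is not called directly but queued asynchronously. A usage sketch of that pattern, assuming the function is registered as a Celery task (the decorator is not shown above):

import random


def queue_audio_processing(audio_pk):
    """Defer conversion of a freshly saved Audio row, spreading the work over
    the next hour so a large crawl does not convert everything at once."""
    process_audio_file.apply_async(
        (audio_pk,),
        countdown=random.randint(0, 3600),
    )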
Example #9
def process_audio_file(pk):
    """Given the key to an audio file, extract its content and add the related
    meta data to the database.
    """
    audio_file = Audio.objects.get(pk=pk)
    path_to_original = audio_file.local_path_original_file.path

    path_to_tmp_location = os.path.join('/tmp', str(time.time()) + '.mp3')

    # Convert original file to:
    #  - mono (-ac 1)
    #  - sample rate (audio samples / s) of 22050Hz (-ar 22050)
    #  - constant bit rate (sample resolution) of 48kbps (-ab 48k)
    avconv_command = [
        'avconv', '-i', path_to_original, '-ac', '1', '-ar', '22050', '-ab',
        '48k', path_to_tmp_location
    ]
    _ = subprocess.check_output(avconv_command, stderr=subprocess.STDOUT)

    # Have to do this last because otherwise the mp3 hasn't yet been generated.
    file_name = trunc(audio_file.case_name.lower(), 72) + '_cl.mp3'
    set_mp3_meta_data(audio_file, path_to_tmp_location)

    audio_file.duration = eyed3.load(path_to_tmp_location).info.time_secs

    with open(path_to_tmp_location, 'rb') as mp3:
        try:
            cf = ContentFile(mp3.read())
            audio_file.local_path_mp3.save(file_name, cf, save=False)
        except:
            msg = "Unable to save mp3 to audio_file in scraper.tasks.process_" \
                  "audio_file for item: %s\nTraceback:\n%s" % \
                  (audio_file.pk, traceback.format_exc())
            logger.critical(msg)
            ErrorLog(log_level='CRITICAL',
                     court=audio_file.docket.court,
                     message=msg).save()

    audio_file.processing_complete = True
    audio_file.save()
Example #10
    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i in range(0, len(site.case_names)):
                msg, r = get_binary_content(
                    site.download_urls[i],
                    site.cookies,
                    method=site.method
                )
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level='WARNING',
                             court=court,
                             message=msg).save()
                    continue

                current_date = site.case_dates[i]
                try:
                    next_date = site.case_dates[i + 1]
                except IndexError:
                    next_date = None

                sha1_hash = hashlib.sha1(r.content).hexdigest()
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    Audio,
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1'
                )

                if onwards == 'CONTINUE':
                    # It's a duplicate, but we haven't hit any thresholds yet.
                    continue
                elif onwards == 'BREAK':
                    # It's a duplicate, and we hit a date or dup_count threshold.
                    dup_checker.update_site_hash(sha1_hash)
                    break
                elif onwards == 'CARRY_ON':
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' % site.download_urls[i])
                    dup_checker.reset()

                    docket, audio_file = self.associate_meta_data_to_objects(
                        site, i, court, sha1_hash)

                    audio_file.docket = docket

                    # Make and associate the file object
                    try:
                        cf = ContentFile(r.content)
                        extension = get_extension(r.content)
                        if extension not in ['.mp3', '.wma']:
                            extension = '.' + site.download_urls[i].rsplit('.', 1)[1]
                        # See bitbucket issue #215 for why this must be
                        # lower-cased.
                        file_name = trunc(site.case_names[i].lower(), 75) + extension
                        audio_file.local_path_original_file.save(file_name, cf, save=False)
                    except:
                        msg = 'Unable to save binary to disk. Deleted document: %s.\n %s' % \
                              (site.case_names[i], traceback.format_exc())
                        logger.critical(msg)
                        ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
                        download_error = True
                        continue

                    self.save_everything(docket, audio_file)
                    random_delay = random.randint(0, 3600)
                    process_audio_file.apply_async(
                        (audio_file.pk,),
                        countdown=random_delay
                    )

                    logger.info("Successfully added audio file %s: %s" % (audio_file.pk, site.case_names[i]))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled oral arguments." % site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)
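DupChecker itself is not shown on this page. Purely to illustrate the three outcomes the comments above describe, here is a simplified, hypothetical stand-in; the real checker also weighs the case dates and the full_crawl flag, which this sketch ignores:

def continue_break_or_carry_on(model, lookup_value, lookup_by,
                               consecutive_dups, dup_threshold=5):
    """Toy stand-in for DupChecker.should_we_continue_break_or_carry_on().

    'CARRY_ON' -> not a duplicate, process the item.
    'CONTINUE' -> a duplicate, but not enough of them yet to give up.
    'BREAK'    -> a duplicate, and enough have been seen that the rest of the
                  feed is probably already in the database.
    """
    if not model.objects.filter(**{lookup_by: lookup_value}).exists():
        return 'CARRY_ON'
    if consecutive_dups + 1 >= dup_threshold:
        return 'BREAK'
    return 'CONTINUE'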
Example #11
    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i in range(0, len(site.case_names)):
                msg, r = get_binary_content(site.download_urls[i],
                                            site.cookies,
                                            method=site.method)
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level='WARNING', court=court,
                             message=msg).save()
                    continue

                current_date = site.case_dates[i]
                try:
                    next_date = site.case_dates[i + 1]
                except IndexError:
                    next_date = None

                # Make a hash of the data
                sha1_hash = hashlib.sha1(r.content).hexdigest()
                if court_str == 'nev' and site.precedential_statuses[
                        i] == 'Unpublished':
                    # Nevada's non-precedential cases have different SHA1 sums every time.
                    onwards = dup_checker.should_we_continue_break_or_carry_on(
                        Document,
                        current_date,
                        next_date,
                        lookup_value=site.download_urls[i],
                        lookup_by='download_url')
                else:
                    onwards = dup_checker.should_we_continue_break_or_carry_on(
                        Document,
                        current_date,
                        next_date,
                        lookup_value=sha1_hash,
                        lookup_by='sha1')

                if onwards == 'CONTINUE':
                    # It's a duplicate, but we haven't hit any thresholds yet.
                    continue
                elif onwards == 'BREAK':
                    # It's a duplicate, and we hit a date or dup_count threshold.
                    dup_checker.update_site_hash(sha1_hash)
                    break
                elif onwards == 'CARRY_ON':
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' %
                                site.download_urls[i])
                    dup_checker.reset()

                    cite, docket, doc = self.associate_meta_data_to_objects(
                        site, i, court, sha1_hash)

                    # Make and associate the file object
                    try:
                        cf = ContentFile(r.content)
                        extension = get_extension(r.content)
                        # See bitbucket issue #215 for why this must be
                        # lower-cased.
                        file_name = trunc(site.case_names[i].lower(),
                                          75) + extension
                        doc.local_path.save(file_name, cf, save=False)
                    except:
                        msg = 'Unable to save binary to disk. Deleted document: %s.\n %s' % \
                              (site.case_names[i], traceback.format_exc())
                        logger.critical(msg)
                        ErrorLog(log_level='CRITICAL',
                                 court=court,
                                 message=msg).save()
                        download_error = True
                        continue

                    # Save everything, but don't update Solr index yet
                    self.save_everything(cite, docket, doc, index=False)
                    random_delay = random.randint(0, 3600)
                    extract_doc_content.delay(doc.pk,
                                              callback=subtask(extract_by_ocr),
                                              citation_countdown=random_delay)

                    logger.info("Successfully added doc {pk}: {name}".format(
                        pk=doc.pk, name=site.case_names[i]))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled opinions." % site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)
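The Nevada branch above exists because, as the comment says, Nevada's non-precedential opinions hash differently on every download, so duplicates have to be looked up by URL instead of by SHA1. The same choice pulled out into a small helper with a hypothetical name:

def dup_lookup_field(court_str, precedential_status, sha1_hash, download_url):
    """Return the (lookup_by, lookup_value) pair used for duplicate detection."""
    if court_str == 'nev' and precedential_status == 'Unpublished':
        # The content changes on every download, so the hash is useless here.
        return 'download_url', download_url
    return 'sha1', sha1_hash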
Example #12
def download_and_save():
    """This function is run in many threads simultaneously. Each thread
    runs so long as there are items in the queue. Once an item is found, it's
    downloaded and saved.

    The number of items that can be concurrently saved is determined by the
    number of threads that are running this function.
    """
    while True:
        item = queue.get()
        logger.info("Attempting to add item at: %s" % item['url'])
        try:
            msg, r = get_binary_content(
                item['url'],
                {},
            )
        except:
            logger.info("Unable to get item at: %s" % item['url'])
            queue.task_done()
            continue

        if msg:
            logger.warn(msg)
            queue.task_done()
            continue

        sha1_hash = hashlib.sha1(r.content).hexdigest()
        if Audio.objects.filter(sha1=sha1_hash).exists():
            # Simpsons did it! Try the next one.
            logger.info("Item already exists, moving to next item.")
            queue.task_done()
        else:
            # New item, onwards!
            logger.info('Adding new document found at: %s' % item['url'])
            audio_file = Audio(
                source='H',
                sha1=sha1_hash,
                case_name=item['case_name'],
                date_argued=item['date_argued'],
                download_url=item['url'],
                processing_complete=False,
            )
            if item['judges']:
                audio_file.judges = item['judges']
            if item['docket_number']:
                audio_file.docket_number = item['docket_number']

            court = Court.objects.get(pk=item['court_code'])

            docket = Docket(
                case_name=item['case_name'],
                court=court,
            )
            # Make and associate the file object
            try:
                cf = ContentFile(r.content)
                extension = get_extension(r.content)
                if extension not in ['.mp3', '.wma']:
                    extension = '.' + item['url'].rsplit('.', 1)[1]
                # See bitbucket issue #215 for why this must be
                # lower-cased.
                file_name = trunc(item['case_name'].lower(), 75) + extension
                audio_file.local_path_original_file.save(file_name,
                                                         cf,
                                                         save=False)
            except:
                msg = 'Unable to save binary to disk. Deleted document: %s.\n %s' % \
                      (item['case_name'], traceback.format_exc())
                logger.critical(msg)
                queue.task_done()
                continue

            docket.save()
            audio_file.docket = docket
            audio_file.save(index=False)

            random_delay = random.randint(0, 3600)
            process_audio_file.apply_async((audio_file.pk, ),
                                           countdown=random_delay)

            logger.info("Successfully added audio file %s: %s" %
                        (audio_file.pk, audio_file.case_name))
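The docstring says download_and_save is meant to run in several threads at once, all pulling from a shared queue. A minimal launch sketch under that assumption; the queue setup and thread count are illustrative, not part of the example:

import threading

try:
    import Queue as _queue_mod  # Python 2, matching the examples
except ImportError:
    import queue as _queue_mod  # Python 3

# Assumed module-level queue shared with download_and_save() above.
queue = _queue_mod.Queue()


def start_download_workers(items, num_threads=10):
    """Fill the queue, start the workers, and wait for the work to drain."""
    for item in items:
        queue.put(item)
    for _ in range(num_threads):
        t = threading.Thread(target=download_and_save)
        t.daemon = True   # workers loop forever; let the process exit when done
        t.start()
    queue.join()          # returns once every item has been marked task_done()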
Example #13
    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split(".")[-1].split("_")[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i in range(0, len(site.case_names)):
                msg, r = get_binary_content(
                    site.download_urls[i], site.cookies, site._get_adapter_instance(), method=site.method
                )
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level="WARNING", court=court, message=msg).save()
                    continue
                content = site.cleanup_content(r.content)

                current_date = site.case_dates[i]
                try:
                    next_date = site.case_dates[i + 1]
                except IndexError:
                    next_date = None

                # Make a hash of the data
                if isinstance(content, unicode):
                    sha1_hash = hashlib.sha1(content.encode("utf-8")).hexdigest()
                else:
                    sha1_hash = hashlib.sha1(content).hexdigest()
                if court_str == "nev" and site.precedential_statuses[i] == "Unpublished":
                    # Nevada's non-precedential cases have different SHA1
                    # sums every time.
                    onwards = dup_checker.should_we_continue_break_or_carry_on(
                        Document, current_date, next_date, lookup_value=site.download_urls[i], lookup_by="download_url"
                    )
                else:
                    onwards = dup_checker.should_we_continue_break_or_carry_on(
                        Document, current_date, next_date, lookup_value=sha1_hash, lookup_by="sha1"
                    )

                if onwards == "CONTINUE":
                    # It's a duplicate, but we haven't hit any thresholds yet.
                    continue
                elif onwards == "BREAK":
                    # It's a duplicate, and we hit a date or dup_count
                    # threshold.
                    dup_checker.update_site_hash(sha1_hash)
                    break
                elif onwards == "CARRY_ON":
                    # Not a duplicate, carry on
                    logger.info("Adding new document found at: %s" % site.download_urls[i].encode("utf-8"))
                    dup_checker.reset()

                    cite, docket, doc = self.associate_meta_data_to_objects(site, i, court, sha1_hash)

                    # Make and associate the file object
                    try:
                        cf = ContentFile(content)
                        extension = get_extension(content)
                        # See bitbucket issue #215 for why this must be
                        # lower-cased.
                        file_name = trunc(site.case_names[i].lower(), 75) + extension
                        doc.local_path.save(file_name, cf, save=False)
                    except:
                        msg = "Unable to save binary to disk. Deleted " "document: % s.\n % s" % (
                            site.case_names[i],
                            traceback.format_exc(),
                        )
                        logger.critical(msg.encode("utf-8"))
                        ErrorLog(log_level="CRITICAL", court=court, message=msg).save()
                        download_error = True
                        continue

                    # Save everything, but don't update Solr index yet
                    self.save_everything(cite, docket, doc, index=False)
                    random_delay = random.randint(0, 3600)
                    extract_doc_content.delay(doc.pk, callback=subtask(extract_by_ocr), citation_countdown=random_delay)

                    logger.info(
                        "Successfully added doc {pk}: {name}".format(pk=doc.pk, name=site.case_names[i].encode("utf-8"))
                    )

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled opinions." % site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)
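Examples #13 and #16 both guard the SHA1 call against unicode content: binary downloads hash directly, while cleaned-up HTML may need to be encoded first. That guard as a small helper, matching the page's Python 2 types:

import hashlib


def sha1_of(content):
    """Hash scraped content, encoding unicode text first (Python 2 semantics)."""
    if isinstance(content, unicode):
        content = content.encode('utf-8')
    return hashlib.sha1(content).hexdigest()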
Example #14
    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i in range(0, len(site.case_names)):
                msg, r = get_binary_content(site.download_urls[i],
                                            site.cookies,
                                            site._get_adapter_instance(),
                                            method=site.method)
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level='WARNING', court=court,
                             message=msg).save()
                    continue
                content = site.cleanup_content(r.content)

                current_date = site.case_dates[i]
                try:
                    next_date = site.case_dates[i + 1]
                except IndexError:
                    next_date = None

                sha1_hash = hashlib.sha1(content).hexdigest()
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    Audio,
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1')

                if onwards == 'CONTINUE':
                    # It's a duplicate, but we haven't hit any thresholds yet.
                    continue
                elif onwards == 'BREAK':
                    # It's a duplicate, and we hit a date or dup_count threshold.
                    dup_checker.update_site_hash(sha1_hash)
                    break
                elif onwards == 'CARRY_ON':
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' %
                                site.download_urls[i])
                    dup_checker.reset()

                    docket, audio_file = self.associate_meta_data_to_objects(
                        site, i, court, sha1_hash)

                    # Make and associate the file object
                    try:
                        cf = ContentFile(content)
                        extension = get_extension(content)
                        if extension not in ['.mp3', '.wma']:
                            extension = '.' + site.download_urls[i].rsplit(
                                '.', 1)[1]
                        # See bitbucket issue #215 for why this must be
                        # lower-cased.
                        file_name = trunc(site.case_names[i].lower(),
                                          75) + extension
                        audio_file.local_path_original_file.save(file_name,
                                                                 cf,
                                                                 save=False)
                    except:
                        msg = 'Unable to save binary to disk. Deleted document: %s.\n %s' % \
                              (site.case_names[i], traceback.format_exc())
                        logger.critical(msg)
                        ErrorLog(log_level='CRITICAL',
                                 court=court,
                                 message=msg).save()
                        download_error = True
                        continue

                    self.save_everything(docket, audio_file)
                    random_delay = random.randint(0, 3600)
                    process_audio_file.apply_async((audio_file.pk, ),
                                                   countdown=random_delay)

                    logger.info("Successfully added audio file %s: %s" %
                                (audio_file.pk, site.case_names[i]))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled oral arguments." %
                        site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)
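The audio examples (#5, #10, #14) all fall back to the download URL's suffix when get_extension does not recognize the content as .mp3 or .wma. The same fallback as a helper with a hypothetical name; get_extension is the content-sniffing function the examples already use:

def audio_extension(content, download_url):
    """Pick a file extension for scraped audio, as the try blocks above do."""
    extension = get_extension(content)
    if extension not in ['.mp3', '.wma']:
        # Fall back to whatever follows the last dot in the URL.
        extension = '.' + download_url.lower().rsplit('.', 1)[1]
    return extension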
Example #15
def download_and_save():
    """This function is run in many threads simultaneously. Each thread
    runs so long as there are items in the queue. Once an item is found, it's
    downloaded and saved.

    The number of items that can be concurrently saved is determined by the
    number of threads that are running this function.
    """
    while True:
        item = queue.get()
        logger.info("Attempting to add item at: %s" % item['url'])
        try:
            msg, r = get_binary_content(
                item['url'],
                {},
            )
        except:
            logger.info("Unable to get item at: %s" % item['url'])
            queue.task_done()
            continue

        if msg:
            logger.warn(msg)
            queue.task_done()
            continue

        sha1_hash = hashlib.sha1(r.content).hexdigest()
        if Audio.objects.filter(sha1=sha1_hash).exists():
            # Simpsons did it! Try the next one.
            logger.info("Item already exists, moving to next item.")
            queue.task_done()
        else:
            # New item, onwards!
            logger.info('Adding new document found at: %s' % item['url'])
            audio_file = Audio(
                source='H',
                sha1=sha1_hash,
                case_name=item['case_name'],
                date_argued=item['date_argued'],
                download_url=item['url'],
                processing_complete=False,
            )
            if item['judges']:
                audio_file.judges = item['judges']
            if item['docket_number']:
                audio_file.docket_number = item['docket_number']

            court = Court.objects.get(pk=item['court_code'])

            docket = Docket(
                case_name=item['case_name'],
                court=court,
            )
            # Make and associate the file object
            try:
                cf = ContentFile(r.content)
                extension = get_extension(r.content)
                if extension not in ['.mp3', '.wma']:
                    extension = '.' + item['url'].rsplit('.', 1)[1]
                # See bitbucket issue #215 for why this must be
                # lower-cased.
                file_name = trunc(item['case_name'].lower(), 75) + extension
                audio_file.local_path_original_file.save(file_name, cf,
                                                         save=False)
            except:
                msg = 'Unable to save binary to disk. Deleted document: %s.\n %s' % \
                      (item['case_name'], traceback.format_exc())
                logger.critical(msg)
                queue.task_done()
                continue

            docket.save()
            audio_file.docket = docket
            audio_file.save(index=False)

            random_delay = random.randint(0, 3600)
            process_audio_file.apply_async(
                (audio_file.pk,),
                countdown=random_delay
            )

            logger.info("Successfully added audio file %s: %s" % (
                audio_file.pk, audio_file.case_name))
Example #16
def scrape_court(site, full_crawl=False):
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split('.')[-1].split('_')[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(site.court_id, full_crawl=full_crawl)
    abort = dup_checker.abort_by_hash(site.hash)
    if not abort:
        for i in range(0, len(site.case_names)):
            msg, r = get_binary_content(site.download_urls[i], site._get_cookies())
            if msg:
                logger.warn(msg)
                ErrorLog(log_level='WARNING',
                         court=court,
                         message=msg).save()
                continue
            # Only clean up the content once we know the download succeeded.
            clean_content = site._cleanup_content(r.content)

            current_date = site.case_dates[i]
            try:
                next_date = site.case_dates[i + 1]
            except IndexError:
                next_date = None

            # Make a hash of the data. Need to convert unicode to binary before hashing.
            if isinstance(clean_content, unicode):
                hash_content = clean_content.encode('utf-8')
            else:
                hash_content = clean_content
            sha1_hash = hashlib.sha1(hash_content).hexdigest()
            if court_str == 'nev' and site.precedential_statuses[i] == 'Unpublished':
                # Nevada's non-precedential cases have different SHA1 sums every time.
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    current_date,
                    next_date,
                    lookup_value=site.download_urls[i],
                    lookup_by='download_url'
                )
            else:
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1'
                )

            if onwards == 'CONTINUE':
                # It's a duplicate, but we haven't hit any thresholds yet.
                continue
            elif onwards == 'BREAK':
                # It's a duplicate, and we hit a date or dup_count threshold.
                dup_checker.update_site_hash(sha1_hash)
                break
            elif onwards == 'CARRY_ON':
                # Not a duplicate, carry on
                logger.info('Adding new document found at: %s' % site.download_urls[i])
                dup_checker.reset()

                # Make a citation
                cite = Citation(case_name=site.case_names[i])
                if site.docket_numbers:
                    cite.docket_number = site.docket_numbers[i]
                if site.neutral_citations:
                    cite.neutral_cite = site.neutral_citations[i]
                if site.west_citations:
                    cite.federal_cite_one = site.west_citations[i]
                if site.west_state_citations:
                    cite.west_state_cite = site.west_state_citations[i]

                # Make the document object
                doc = Document(source='C',
                               sha1=sha1_hash,
                               date_filed=site.case_dates[i],
                               court=court,
                               download_url=site.download_urls[i],
                               precedential_status=site.precedential_statuses[i])

                # Make and associate the file object
                try:
                    cf = ContentFile(clean_content)
                    extension = get_extension(r.content)
                    # See issue #215 for why this must be lower-cased.
                    file_name = trunc(site.case_names[i].lower(), 75) + extension
                    doc.local_path.save(file_name, cf, save=False)
                except:
                    msg = 'Unable to save binary to disk. Deleted document: %s.\n %s' % \
                          (cite.case_name, traceback.format_exc())
                    logger.critical(msg)
                    ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
                    download_error = True
                    continue

                if site.judges:
                    doc.judges = site.judges[i]
                if site.nature_of_suit:
                    doc.nature_of_suit = site.nature_of_suit[i]

                # Save everything, but don't update Solr index yet
                cite.save(index=False)
                doc.citation = cite
                doc.save(index=False)

                # Extract the contents asynchronously.
                extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))

                logger.info("Successfully added doc %s: %s" % (doc.pk, site.case_names[i]))

        # Update the hash if everything finishes properly.
        logger.info("%s: Successfully crawled." % site.court_id)
        if not download_error and not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)