Example #1
    def setUp(self):
        super(ReportScrapeStatusTest, self).setUp()
        self.court = Court.objects.get(pk="test")
        # Make some errors that we can tally
        ErrorLog(log_level="WARNING", court=self.court,
                 message="test_msg").save()
        ErrorLog(log_level="CRITICAL", court=self.court,
                 message="test_msg").save()
Example #2
    def make_objects(self, item, court, sha1_hash, content):
        """Takes the meta data from the scraper and associates it with objects.

        Returns the created objects.
        """
        blocked = item['blocked_statuses']
        if blocked is not None:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))
        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            source=Docket.SCRAPER,
        )

        cluster = OpinionCluster(
            judges=item.get('judges', ''),
            date_filed=item['case_dates'],
            case_name=item['case_names'],
            case_name_short=case_name_short,
            source='C',
            precedential_status=item['precedential_statuses'],
            nature_of_suit=item.get('nature_of_suit', ''),
            blocked=blocked,
            date_blocked=date_blocked,
            federal_cite_one=item.get('west_citations', ''),
            state_cite_one=item.get('west_state_citations', ''),
            neutral_cite=item.get('neutral_citations', ''),
        )
        opinion = Opinion(
            type='010combined',
            sha1=sha1_hash,
            download_url=item['download_urls'],
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            file_name = trunc(item['case_names'].lower(), 75) + extension
            opinion.file_with_date = cluster.date_filed
            opinion.local_path.save(file_name, cf, save=False)
        except:
            msg = ('Unable to save binary to disk. Deleted '
                   'item: %s.\n %s' %
                   (item['case_names'], traceback.format_exc()))
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, opinion, cluster, error
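
The returned objects are neither linked nor saved here; that is left to the caller (the save_everything used in the scrape_court examples further down, whose body is not part of this listing). A minimal sketch of the usual wiring, assuming docket and cluster are the foreign-key field names:

    def save_objects_sketch(docket, cluster, opinion):
        # Persist in dependency order: the docket first, then the cluster
        # that points at it, then the opinion that points at the cluster.
        docket.save()
        cluster.docket = docket    # assumed FK field name
        cluster.save()
        opinion.cluster = cluster  # assumed FK field name
        opinion.save()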
Example #3
    def make_objects(self, item, court, sha1_hash, content):
        blocked = item["blocked_statuses"]
        if blocked:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = item.get(
            "case_name_shorts"
        ) or self.cnt.make_case_name_short(item["case_names"])

        docket = Docket(
            docket_number=item.get("docket_numbers", ""),
            case_name=item["case_names"],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            date_argued=item["case_dates"],
            source=Docket.SCRAPER,
        )

        audio_file = Audio(
            judges=item.get("judges", ""),
            source="C",
            case_name=item["case_names"],
            case_name_short=case_name_short,
            sha1=sha1_hash,
            download_url=item["download_urls"],
            blocked=blocked,
            date_blocked=date_blocked,
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            if extension not in [".mp3", ".wma"]:
                extension = (
                    "." + item["download_urls"].lower().rsplit(".", 1)[1]
                )
            # See bitbucket issue #215 for why this must be
            # lower-cased.
            file_name = trunc(item["case_names"].lower(), 75) + extension
            audio_file.file_with_date = docket.date_argued
            audio_file.local_path_original_file.save(file_name, cf, save=False)
        except:
            msg = (
                "Unable to save binary to disk. Deleted audio file: %s.\n "
                "%s" % (item["case_names"], traceback.format_exc())
            )
            logger.critical(msg.encode("utf-8"))
            ErrorLog(log_level="CRITICAL", court=court, message=msg).save()
            error = True

        return docket, audio_file, error
Example #4
    def make_objects(self, item, court, sha1_hash, content):
        blocked = item['blocked_statuses']
        if blocked:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))

        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            date_argued=item['case_dates'],
            source=Docket.SCRAPER,
        )

        audio_file = Audio(
            judges=item.get('judges', ''),
            source='C',
            case_name=item['case_names'],
            case_name_short=case_name_short,
            sha1=sha1_hash,
            download_url=item['download_urls'],
            blocked=blocked,
            date_blocked=date_blocked,
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            if extension not in ['.mp3', '.wma']:
                extension = '.' + item['download_urls'].lower().rsplit('.',
                                                                       1)[1]
            # See bitbucket issue #215 for why this must be
            # lower-cased.
            file_name = trunc(item['case_names'].lower(), 75) + extension
            audio_file.file_with_date = docket.date_argued
            audio_file.local_path_original_file.save(file_name, cf, save=False)
        except:
            msg = 'Unable to save binary to disk. Deleted audio file: %s.\n ' \
                  '%s' % (item['case_names'], traceback.format_exc())
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, audio_file, error
Example #5
    def handle(self, *args, **options):
        global die_now

        # this line is used for handling SIGTERM (CTRL+4), so things can die
        # safely
        signal.signal(signal.SIGTERM, signal_handler)

        module_strings = build_module_list(options['court_id'])
        if not len(module_strings):
            raise CommandError('Unable to import module or package. Aborting.')

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        wait = (options['rate'] * 60) / num_courts
        i = 0
        while i < num_courts:
            # this catches SIGTERM, so the code can be killed safely.
            if die_now:
                logger.info("The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)

            mod = __import__("%s.%s" % (package, module), globals(), locals(),
                             [module])
            # noinspection PyBroadException
            try:
                self.parse_and_scrape_site(mod, options['full_crawl'])
            except Exception as e:
                # noinspection PyBroadException
                try:
                    msg = ('********!! CRAWLER DOWN !!***********\n'
                           '*****scrape_court method failed!*****\n'
                           '********!! ACTION NEEDED !!**********\n%s' %
                           traceback.format_exc())
                    logger.critical(msg)

                    # opinions.united_states.federal.ca9_u --> ca9
                    court_str = mod.Site.__module__.split('.')[-1].split(
                        '_')[0]
                    court = Court.objects.get(pk=court_str)
                    ErrorLog(log_level='CRITICAL', court=court,
                             message=msg).save()
                except Exception as e:
                    # This is very important. Without this, an exception
                    # above will crash the caller.
                    pass
            finally:
                # Pause between courts so the rate limit is respected, then
                # advance to the next module; without this the loop above
                # never terminates.
                time.sleep(wait)
                i += 1
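
handle() relies on a module-level die_now flag that signal_handler flips when SIGTERM arrives; neither is part of this excerpt. A minimal sketch of the usual pattern (the names come from the calls above, the bodies are assumed):

    die_now = False

    def signal_handler(signal, frame):
        # Flip the flag; the while-loop in handle() checks it and exits at
        # the next safe point instead of dying mid-scrape.
        global die_now
        die_now = True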
Example #6
    def scrape_court(self, site, full_crawl=False, backscrape=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split(".")[-1].split("_")[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i, item in enumerate(site):
                msg, r = get_binary_content(
                    item["download_urls"],
                    site.cookies,
                    site._get_adapter_instance(),
                    method=site.method,
                )
                if msg:
                    logger.warning(msg)
                    ErrorLog(
                        log_level="WARNING", court=court, message=msg
                    ).save()
                    continue

                content = site.cleanup_content(r.content)

                current_date = item["case_dates"]
                try:
                    next_date = site[i + 1]["case_dates"]
                except IndexError:
                    next_date = None

                # request.content is sometimes a str, sometimes unicode, so
                # force it all to be bytes, pleasing hashlib.
                sha1_hash = sha1(force_bytes(content))
                onwards = dup_checker.press_on(
                    Audio,
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by="sha1",
                )
                if dup_checker.emulate_break:
                    break

                if onwards:
                    # Not a duplicate, carry on
                    logger.info(
                        "Adding new document found at: %s"
                        % item["download_urls"].encode("utf-8")
                    )
                    dup_checker.reset()

                    docket, audio_file, error = make_objects(
                        item, court, sha1_hash, content
                    )

                    if error:
                        download_error = True
                        continue

                    save_everything(
                        items={"docket": docket, "audio_file": audio_file},
                        index=False,
                        backscrape=backscrape,
                    )
                    process_audio_file.apply_async(
                        (audio_file.pk,), countdown=random.randint(0, 3600)
                    )

                    logger.info(
                        "Successfully added audio file {pk}: {name}".format(
                            pk=audio_file.pk,
                            name=item["case_names"].encode("utf-8"),
                        )
                    )

            # Update the hash if everything finishes properly.
            logger.info(
                "%s: Successfully crawled oral arguments." % site.court_id
            )
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)
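
The sha1 used above is a local helper rather than hashlib.sha1 itself: its result is passed as the lookup value for the model's sha1 field, so it presumably returns a hex digest string. A sketch under that assumption (the content is already bytes thanks to force_bytes at the call site):

    import hashlib

    def sha1(s):
        # Assumed helper: hex digest of the content bytes, used as the
        # de-duplication lookup value above.
        return hashlib.sha1(s).hexdigest()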
Example #7
    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i, item in enumerate(site):
                msg, r = get_binary_content(item['download_urls'],
                                            site.cookies,
                                            site._get_adapter_instance(),
                                            method=site.method)
                if msg:
                    logger.warning(msg)
                    ErrorLog(log_level='WARNING', court=court,
                             message=msg).save()
                    continue

                content = site.cleanup_content(r.content)

                current_date = item['case_dates']
                try:
                    next_date = site[i + 1]['case_dates']
                except IndexError:
                    next_date = None

                # request.content is sometimes a str, sometimes unicode, so
                # force it all to be bytes, pleasing hashlib.
                sha1_hash = sha1(force_bytes(content))
                if (court_str == 'nev'
                        and item['precedential_statuses'] == 'Unpublished'):
                    # Nevada's non-precedential cases have different SHA1
                    # sums every time.
                    lookup_params = {
                        'lookup_value': item['download_urls'],
                        'lookup_by': 'download_url'
                    }
                else:
                    lookup_params = {
                        'lookup_value': sha1_hash,
                        'lookup_by': 'sha1'
                    }

                onwards = dup_checker.press_on(Opinion, current_date,
                                               next_date, **lookup_params)
                if dup_checker.emulate_break:
                    break

                if onwards:
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' %
                                item['download_urls'].encode('utf-8'))
                    dup_checker.reset()

                    docket, opinion, cluster, citations, error = self.make_objects(
                        item, court, sha1_hash, content)

                    if error:
                        download_error = True
                        continue

                    self.save_everything(
                        items={
                            'docket': docket,
                            'opinion': opinion,
                            'cluster': cluster,
                            'citations': citations,
                        },
                        index=False,
                    )
                    extract_doc_content.delay(
                        opinion.pk,
                        do_ocr=True,
                        citation_jitter=True,
                    )

                    logger.info("Successfully added doc {pk}: {name}".format(
                        pk=opinion.pk,
                        name=item['case_names'].encode('utf-8'),
                    ))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled opinions." % site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)
Example #8
    def scrape_court(self, site, full_crawl=False, ocr_available=True):
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split(".")[-1].split("_")[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        if dup_checker.abort_by_url_hash(site.url, site.hash):
            return

        if site.cookies:
            logger.info(f"Using cookies: {site.cookies}")
        for i, item in enumerate(site):
            msg, r = get_binary_content(
                item["download_urls"],
                site.cookies,
                method=site.method,
            )
            if msg:
                logger.warning(msg)
                ErrorLog(log_level="WARNING", court=court, message=msg).save()
                continue

            content = site.cleanup_content(r.content)

            current_date = item["case_dates"]
            try:
                next_date = site[i + 1]["case_dates"]
            except IndexError:
                next_date = None

            # request.content is sometimes a str, sometimes unicode, so
            # force it all to be bytes, pleasing hashlib.
            sha1_hash = sha1(force_bytes(content))
            if (
                court_str == "nev"
                and item["precedential_statuses"] == "Unpublished"
            ):
                # Nevada's non-precedential cases have different SHA1 sums
                # every time.
                lookup_params = {
                    "lookup_value": item["download_urls"],
                    "lookup_by": "download_url",
                }
            else:
                lookup_params = {
                    "lookup_value": sha1_hash,
                    "lookup_by": "sha1",
                }

            proceed = dup_checker.press_on(
                Opinion, current_date, next_date, **lookup_params
            )
            if dup_checker.emulate_break:
                break
            if not proceed:
                continue

            # Not a duplicate, carry on
            logger.info(
                f"Adding new document found at: {item['download_urls'].encode()}"
            )
            dup_checker.reset()

            docket, opinion, cluster, citations = make_objects(
                item, court, sha1_hash, content
            )

            save_everything(
                items={
                    "docket": docket,
                    "opinion": opinion,
                    "cluster": cluster,
                    "citations": citations,
                },
                index=False,
            )
            extract_doc_content.delay(
                opinion.pk, ocr_available=ocr_available, citation_jitter=True
            )

            logger.info(
                f"Successfully added doc {opinion.pk}: {item['case_names'].encode()}"
            )

        # Update the hash if everything finishes properly.
        logger.info(f"{site.court_id}: Successfully crawled opinions.")
        if not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
Example #9
    def make_objects(self, item, court, sha1_hash, content):
        """Takes the meta data from the scraper and associates it with objects.

        Returns the created objects.
        """
        blocked = item["blocked_statuses"]
        if blocked:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = item.get(
            "case_name_shorts"
        ) or self.cnt.make_case_name_short(item["case_names"])
        docket = Docket(
            docket_number=item.get("docket_numbers", ""),
            case_name=item["case_names"],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            source=Docket.SCRAPER,
        )

        west_cite_str = item.get("west_citations", "")
        state_cite_str = item.get("west_state_citations", "")
        neutral_cite_str = item.get("neutral_citations", "")
        cluster = OpinionCluster(
            judges=item.get("judges", ""),
            date_filed=item["case_dates"],
            date_filed_is_approximate=item["date_filed_is_approximate"],
            case_name=item["case_names"],
            case_name_short=case_name_short,
            source="C",
            precedential_status=item["precedential_statuses"],
            nature_of_suit=item.get("nature_of_suit", ""),
            blocked=blocked,
            date_blocked=date_blocked,
            # These three fields are replaced below.
            federal_cite_one=west_cite_str,
            state_cite_one=state_cite_str,
            neutral_cite=neutral_cite_str,
            syllabus=item.get("summaries", ""),
        )
        citations = []
        cite_types = [
            (west_cite_str, Citation.WEST),
            (state_cite_str, Citation.STATE),
            (neutral_cite_str, Citation.NEUTRAL),
        ]
        for cite_str, cite_type in cite_types:
            if cite_str:
                citations.append(make_citation(cite_str, cluster, cite_type))
        opinion = Opinion(
            type=Opinion.COMBINED,
            sha1=sha1_hash,
            download_url=item["download_urls"],
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            file_name = trunc(item["case_names"].lower(), 75) + extension
            opinion.file_with_date = cluster.date_filed
            opinion.local_path.save(file_name, cf, save=False)
        except:
            msg = ("Unable to save binary to disk. Deleted "
                   "item: %s.\n %s" %
                   (item["case_names"], traceback.format_exc()))
            logger.critical(msg.encode("utf-8"))
            ErrorLog(log_level="CRITICAL", court=court, message=msg).save()
            error = True

        return docket, opinion, cluster, citations, error
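
make_citation is not shown in this example, but Example #11 below expands the same per-type logic inline. Based on that, the helper presumably looks roughly like the following sketch (not a verified implementation):

    def make_citation(cite_str, cluster, cite_type):
        # Parse the citation string and attach a Citation of the requested
        # type to the cluster, mirroring the per-type blocks in Example #11.
        citation_obj = get_citations(cite_str)[0]
        return Citation(
            cluster=cluster,
            volume=citation_obj.volume,
            reporter=citation_obj.reporter,
            page=citation_obj.page,
            type=cite_type,
        )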
Example #10
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)
        global die_now

        # this line is used for handling SIGTERM (CTRL+4), so things can die
        # safely
        signal.signal(signal.SIGTERM, signal_handler)

        module_strings = build_module_list(options["court_id"])
        if not len(module_strings):
            raise CommandError("Unable to import module or package. Aborting.")

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        wait = (options["rate"] * 60) / num_courts
        i = 0
        while i < num_courts:
            # this catches SIGTERM, so the code can be killed safely.
            if die_now:
                logger.info("The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit(".", 1)

            mod = __import__("%s.%s" % (package, module), globals(), locals(),
                             [module])
            # noinspection PyBroadException
            try:
                self.parse_and_scrape_site(mod, options["full_crawl"])
            except Exception as e:
                # noinspection PyBroadException
                try:
                    msg = ("********!! CRAWLER DOWN !!***********\n"
                           "*****scrape_court method failed!*****\n"
                           "********!! ACTION NEEDED !!**********\n%s" %
                           traceback.format_exc())
                    logger.critical(msg)

                    # opinions.united_states.federal.ca9_u --> ca9
                    court_str = mod.Site.__module__.split(".")[-1].split(
                        "_")[0]
                    court = Court.objects.get(pk=court_str)
                    ErrorLog(log_level="CRITICAL", court=court,
                             message=msg).save()
                except Exception as e:
                    # This is very important. Without this, an exception
                    # above will crash the caller.
                    pass
            finally:
                time.sleep(wait)
                last_court_in_list = i == (num_courts - 1)
                if last_court_in_list and options["daemon"]:
                    # Start over...
                    logger.info(
                        "All jurisdictions done. Looping back to "
                        "the beginning because daemon mode is enabled.")
                    i = 0
                else:
                    i += 1

        logger.info("The scraper has stopped.")
        sys.exit(0)
Example #11
    def make_objects(self, item, court, sha1_hash, content):
        """Takes the meta data from the scraper and associates it with objects.

        Returns the created objects.
        """
        blocked = item['blocked_statuses']
        if blocked:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))
        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            source=Docket.SCRAPER,
        )

        west_cite_str = item.get('west_citations', '')
        state_cite_str = item.get('west_state_citations', '')
        neutral_cite_str = item.get('neutral_citations', '')
        cluster = OpinionCluster(
            judges=item.get('judges', ''),
            date_filed=item['case_dates'],
            date_filed_is_approximate=item['date_filed_is_approximate'],
            case_name=item['case_names'],
            case_name_short=case_name_short,
            source='C',
            precedential_status=item['precedential_statuses'],
            nature_of_suit=item.get('nature_of_suit', ''),
            blocked=blocked,
            date_blocked=date_blocked,
            # These three fields are replaced below.
            federal_cite_one=west_cite_str,
            state_cite_one=state_cite_str,
            neutral_cite=neutral_cite_str,
            syllabus=item.get('summaries', ''),
        )
        citations = []
        if west_cite_str:
            citation_obj = get_citations(west_cite_str)[0]
            citations.append(
                Citation(
                    cluster=cluster,
                    volume=citation_obj.volume,
                    reporter=citation_obj.reporter,
                    page=citation_obj.page,
                    type=Citation.WEST,
                ))
        if state_cite_str:
            citation_obj = get_citations(state_cite_str)[0]
            citations.append(
                Citation(
                    cluster=cluster,
                    volume=citation_obj.volume,
                    reporter=citation_obj.reporter,
                    page=citation_obj.page,
                    type=Citation.STATE,
                ))
        if neutral_cite_str:
            citation_obj = get_citations(neutral_cite_str)[0]
            citations.append(
                Citation(
                    cluster=cluster,
                    volume=citation_obj.volume,
                    reporter=citation_obj.reporter,
                    page=citation_obj.page,
                    type=Citation.NEUTRAL,
                ))
        opinion = Opinion(
            type='010combined',
            sha1=sha1_hash,
            download_url=item['download_urls'],
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            file_name = trunc(item['case_names'].lower(), 75) + extension
            opinion.file_with_date = cluster.date_filed
            opinion.local_path.save(file_name, cf, save=False)
        except:
            msg = ('Unable to save binary to disk. Deleted '
                   'item: %s.\n %s' %
                   (item['case_names'], traceback.format_exc()))
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, opinion, cluster, citations, error
Example #12
    try:
        _ = subprocess.check_output(avconv_command, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        print('avconv failed command: %s\nerror code: %s\noutput: %s\n' %
              (avconv_command, e.returncode, e.output))
        print(traceback.format_exc())
        raise

    # Have to do this last because otherwise the mp3 hasn't yet been generated.
    set_mp3_meta_data(af, path_to_tmp_location)

    af.duration = eyed3.load(path_to_tmp_location).info.time_secs

    with open(path_to_tmp_location, 'rb') as mp3:
        try:
            cf = ContentFile(mp3.read())
            file_name = trunc(best_case_name(af).lower(), 72) + '_cl.mp3'
            af.file_with_date = af.docket.date_argued
            af.local_path_mp3.save(file_name, cf, save=False)
        except:
            msg = "Unable to save mp3 to audio_file in scraper.tasks.process_" \
                  "audio_file for item: %s\nTraceback:\n%s" % \
                  (af.pk, traceback.format_exc())
            print(msg)
            ErrorLog(log_level='CRITICAL', court=af.docket.court,
                     message=msg).save()

    af.processing_complete = True
    af.save()
    os.remove(path_to_tmp_location)
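
avconv_command is built earlier in the task and is not part of this excerpt. Purely as an illustration of its likely shape (the source-path variable and the bare -i invocation are assumptions, not taken from the source):

    # Hypothetical construction: convert the originally downloaded file to
    # the temporary MP3 that the code above reads back with eyed3.
    avconv_command = [
        'avconv', '-i', path_to_original_file,
        path_to_tmp_location,
    ]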