Example #1
    def handle(self, *args, **options):
        court_id = options.get('court_id')
        if not court_id:
            raise CommandError('You must specify a court as a package or module')
        else:
            module_strings = build_module_list(court_id)
            if not len(module_strings):
                raise CommandError('Unable to import module or package. Aborting.')

            logger.info("Starting up the scraper.")
            num_courts = len(module_strings)
            i = 0
            while i < num_courts:
                package, module = module_strings[i].rsplit('.', 1)

                mod = __import__("%s.%s" % (package, module),
                                 globals(),
                                 locals(),
                                 [module])
                # noinspection PyBroadException
                try:
                    self.back_scrape(mod)
                except Exception, e:
                    msg = ('********!! CRAWLER DOWN !!***********\n'
                           '*****scrape_court method failed!*****\n'
                           '********!! ACTION NEEDED !!**********\n%s') % traceback.format_exc()
                    logger.critical(msg)
                finally:
Example #2
    def handle(self, *args, **options):
        global die_now

        # this line is used for handling SIGTERM (CTRL+4), so things can die
        # safely
        signal.signal(signal.SIGTERM, signal_handler)

        self.verbosity = int(options.get("verbosity", 1))
        daemon_mode = options.get("daemonmode", False)

        full_crawl = options.get("full_crawl", False)

        try:
            rate = int(options["rate"])
        except (ValueError, AttributeError, TypeError):
            rate = 30

        court_id = options.get("court_id")
        if not court_id:
            raise CommandError("You must specify a court as a package or " "module.")
        else:
            module_strings = build_module_list(court_id)
            if not len(module_strings):
                raise CommandError("Unable to import module or package. " "Aborting.")

            logger.info("Starting up the scraper.")
            num_courts = len(module_strings)
            wait = (rate * 60) / num_courts
            i = 0
            while i < num_courts:
                # this catches SIGTERM, so the code can be killed safely.
                if die_now:
                    logger.info("The scraper has stopped.")
                    sys.exit(1)

                package, module = module_strings[i].rsplit(".", 1)

                mod = __import__("%s.%s" % (package, module), globals(), locals(), [module])
                # noinspection PyBroadException
                try:
                    self.parse_and_scrape_site(mod, full_crawl)
                except Exception, e:
                    # noinspection PyBroadException
                    try:
                        msg = (
                            "********!! CRAWLER DOWN !!***********\n"
                            "*****scrape_court method failed!*****\n"
                            "********!! ACTION NEEDED !!**********\n%s" % traceback.format_exc()
                        )
                        logger.critical(msg)

                        # opinions.united_states.federal.ca9_u --> ca9
                        court_str = mod.Site.__module__.split(".")[-1].split("_")[0]
                        court = Court.objects.get(pk=court_str)
                        ErrorLog(log_level="CRITICAL", court=court, message=msg).save()
                    except Exception, e:
                        # This is very important. Without this, an exception
                        # above will crash the caller.
                        pass
                finally:
Example #3
def calculate_counts():
    """Grab the information for new documents over the past 30 days, and
    calculate the number of cases found for each court.

    Returns a list like so:
    [('ca1', date1, link), ('ca2', date2, link), ('ca3',...)]
    """
    thirty_days_ago = now() - timedelta(days=30)
    thirty_five_days_ago = now() - timedelta(days=35)
    cts_more_than_30_days = Court.objects \
        .filter(docket__documents__date_filed__gt=thirty_days_ago) \
        .annotate(count=Count('docket__documents__pk')) \
        .values('pk', 'count')

    # Needed because annotation calls above don't return courts with no new
    # opinions
    all_active_courts = Court.objects.filter(has_opinion_scraper=True) \
        .values_list('pk', flat=True).order_by('position')

    # Reformat the results into dicts...
    cts_more_than_30_days = _make_query_dict(cts_more_than_30_days)

    # Combine everything
    most_recent_opinions = []
    recently_dying_courts = []
    mod_list = importer.build_module_list('juriscraper.opinions')
    mod_dict = {}
    for v in mod_list:
        court = v.rsplit('.')[-1]
        mod_dict[court] = v

    for court in all_active_courts:
        if cts_more_than_30_days.get(court, 0) == 0:
            # No results in newer than 35 days. Get date of most recent
            # item.
            date_filed = Document.objects.filter(docket__court_id=court)\
                .order_by('-date_filed')[0].date_filed
            try:
                mod = __import__(
                    mod_dict[court],
                    globals(),
                    locals(),
                    [mod_dict[court].rsplit('.')[0]],
                )
                url = mod.Site().url
                method = mod.Site().method
            except KeyError:
                # Happens when multiple scrapers for single court.
                url = ""
                method = "Unknown"
            if thirty_five_days_ago.date() < date_filed < \
                    thirty_days_ago.date():
                recently_dying_courts.append((court, date_filed, method, url))
            most_recent_opinions.append((court, date_filed, method, url))

    # Sort by date (index 1)
    most_recent_opinions.sort(key=itemgetter(1), reverse=True)

    return most_recent_opinions, recently_dying_courts
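
Note that calculate_counts() leans on a _make_query_dict helper that none of these examples show. A minimal sketch of what such a helper could look like, assuming each row is a dict with 'pk' and 'count' keys as produced by the .values('pk', 'count') queryset above (illustrative only, not necessarily the project's real implementation):

def _make_query_dict(rows):
    """Collapse rows like {'pk': 'ca1', 'count': 3} into {'ca1': 3}."""
    # Assumption: each row is a dict produced by .values('pk', 'count').
    return {row['pk']: row['count'] for row in rows}
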
Example #4
    def handle(self, *args, **options):
        global die_now

        # this line is used for handling SIGTERM (CTRL+4), so things can die
        # safely
        signal.signal(signal.SIGTERM, signal_handler)

        module_strings = build_module_list(options['court_id'])
        if not len(module_strings):
            raise CommandError('Unable to import module or package. Aborting.')

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        wait = (options['rate'] * 60) / num_courts
        i = 0
        while i < num_courts:
            # this catches SIGTERM, so the code can be killed safely.
            if die_now:
                logger.info("The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)

            mod = __import__(
                "%s.%s" % (package, module),
                globals(),
                locals(),
                [module]
            )
            # noinspection PyBroadException
            try:
                self.parse_and_scrape_site(mod, options['full_crawl'])
            except Exception, e:
                # noinspection PyBroadException
                try:
                    msg = ('********!! CRAWLER DOWN !!***********\n'
                           '*****scrape_court method failed!*****\n'
                           '********!! ACTION NEEDED !!**********\n%s' %
                           traceback.format_exc())
                    logger.critical(msg)

                    # opinions.united_states.federal.ca9_u --> ca9
                    court_str = mod.Site.__module__.split('.')[-1].split('_')[0]
                    court = Court.objects.get(pk=court_str)
                    ErrorLog(
                        log_level='CRITICAL',
                        court=court,
                        message=msg
                    ).save()
                except Exception, e:
                    # This is very important. Without this, an exception
                    # above will crash the caller.
                    pass
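
Every handle() variant above imports each scraper dynamically with __import__("%s.%s" % (package, module), globals(), locals(), [module]). On Python 3 the same step is more commonly written with importlib; a minimal sketch, assuming module_strings holds dotted paths such as "juriscraper.opinions.united_states.federal_appellate.ca1":

import importlib

def import_scraper(module_string):
    """Return the leaf scraper module for a dotted path."""
    # import_module returns the target module itself, so the package/module
    # rsplit('.', 1) dance used with __import__ is unnecessary here.
    return importlib.import_module(module_string)

mod = import_scraper("juriscraper.opinions.united_states.federal_appellate.ca1")
site = mod.Site()
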
Example #5
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)
        global die_now

        # this line is used for handling SIGTERM (CTRL+4), so things can die
        # safely
        signal.signal(signal.SIGTERM, signal_handler)

        module_strings = build_module_list(options["court_id"])
        if not len(module_strings):
            raise CommandError("Unable to import module or package. Aborting.")

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        wait = (options["rate"] * 60) / num_courts
        i = 0
        while i < num_courts:
            # this catches SIGTERM, so the code can be killed safely.
            if die_now:
                logger.info("The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit(".", 1)

            mod = __import__(
                f"{package}.{module}", globals(), locals(), [module]
            )
            try:
                self.parse_and_scrape_site(mod, options["full_crawl"])
            except Exception as e:
                capture_exception(e)
            last_court_in_list = i == (num_courts - 1)
            daemon_mode = options["daemon"]
            if last_court_in_list:
                if not daemon_mode:
                    break
                else:
                    logger.info(
                        "All jurisdictions done. Looping back to "
                        "the beginning because daemon mode is enabled."
                    )
                    i = 0
            else:
                i += 1
            time.sleep(wait)

        logger.info("The scraper has stopped.")
Example #6
    def handle(self, *args, **options):
        global die_now

        # this line is used for handling SIGTERM (CTRL+4), so things can die
        # safely
        signal.signal(signal.SIGTERM, signal_handler)

        module_strings = build_module_list(options['court_id'])
        if not len(module_strings):
            raise CommandError('Unable to import module or package. Aborting.')

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        wait = (options['rate'] * 60) / num_courts
        i = 0
        while i < num_courts:
            # this catches SIGTERM, so the code can be killed safely.
            if die_now:
                logger.info("The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)

            mod = __import__("%s.%s" % (package, module), globals(), locals(),
                             [module])
            # noinspection PyBroadException
            try:
                self.parse_and_scrape_site(mod, options['full_crawl'])
            except Exception, e:
                # noinspection PyBroadException
                try:
                    msg = ('********!! CRAWLER DOWN !!***********\n'
                           '*****scrape_court method failed!*****\n'
                           '********!! ACTION NEEDED !!**********\n%s' %
                           traceback.format_exc())
                    logger.critical(msg)

                    # opinions.united_states.federal.ca9_u --> ca9
                    court_str = mod.Site.__module__.split('.')[-1].split(
                        '_')[0]
                    court = Court.objects.get(pk=court_str)
                    ErrorLog(log_level='CRITICAL', court=court,
                             message=msg).save()
                except Exception, e:
                    # This is very important. Without this, an exception
                    # above will crash the caller.
                    pass
Example #7
    def test_scrape_all_example_files(self):
        """Finds all the $module_example* files and tests them with the sample
        scraper.
        """

        module_strings = build_module_list('juriscraper')
        count = len([s for s in module_strings if 'backscraper' not in s])
        print "Testing {count} scrapers against their example files:".format(
            count=count)
        for module_string in module_strings:
            package, module = module_string.rsplit('.', 1)
            mod = __import__("%s.%s" % (package, module), globals(), locals(),
                             [module])
            if 'backscraper' not in module_string:
                sys.stdout.write('  %s ' % module_string)
                sys.stdout.flush(
                )  # Makes sure the output prints before the error message.
                paths = glob.glob('%s_example*' %
                                  module_string.replace('.', '/'))
                self.assertTrue(
                    paths, "No example file found for: %s!" %
                    module_string.rsplit('.', 1)[1])
                t1 = time.time()
                for path in paths:
                    # This loop allows multiple example files per module
                    if path.endswith('~'):
                        # Text editor backup: Not interesting.
                        continue
                    site = mod.Site()
                    site.url = path
                    # Forces a local GET
                    site.method = 'LOCAL'
                    # do-nothing function, b/c we don't want to iterate over
                    # items in a DeferringList. Otherwise, this function is
                    # called as part of the parse() function.
                    site._clean_attributes = lambda *a: None
                    site.parse()
                t2 = time.time()
                if t2 - t1 > 2:
                    msg = " - WARNING: Slow scraper!"
                else:
                    msg = ' - OK'
                print '(%0.1f seconds%s)' % ((t2 - t1), msg)
Example #8
    def test_scrape_all_example_files(self):
        """Finds all the $module_example* files and tests them with the sample
        scraper.
        """

        module_strings = build_module_list('juriscraper')
        count = len([s for s in module_strings if 'backscraper' not in s])
        print "Testing {count} scrapers against their example files:".format(
            count=count)
        for module_string in module_strings:
            package, module = module_string.rsplit('.', 1)
            mod = __import__("%s.%s" % (package, module),
                             globals(),
                             locals(),
                             [module])
            if 'backscraper' not in module_string:
                sys.stdout.write('  %s ' % module_string)
                sys.stdout.flush()  # Makes sure the output prints before the error message.
                paths = glob.glob(
                    '%s_example*' % module_string.replace('.', '/'))
                self.assertTrue(paths, "No example file found for: %s!" %
                                module_string.rsplit('.', 1)[1])
                t1 = time.time()
                for path in paths:
                    # This loop allows multiple example files per module
                    if path.endswith('~'):
                        # Text editor backup: Not interesting.
                        continue
                    site = mod.Site()
                    site.url = path
                    # Forces a local GET
                    site.method = 'LOCAL'
                    # do-nothing function, b/c we don't want to iterate over
                    # items in a DeferringList. Otherwise, this function is
                    # called as part of the parse() function.
                    site._clean_attributes = lambda *a: None
                    site.parse()
                t2 = time.time()
                if t2 - t1 > 2:
                    msg = " - WARNING: Slow scraper!"
                else:
                    msg = ' - OK'
                print '(%0.1f seconds%s)' % ((t2 - t1), msg)
Example #9
def main():
    global die_now

    # this line is used for handling SIGTERM (CTRL+4), so things can die safely
    signal.signal(signal.SIGTERM, signal_handler)

    usage = ('usage: %prog -c COURTID [-d|--daemon] [-b|--binaries]\n\n'
             'To test ca1, downloading binaries, use: \n'
             '    python %prog -c opinions.united_states.federal_appellate.ca1 -b\n\n'
             'To test all federal courts, omitting binaries, use: \n'
             '    python %prog -c opinions.united_states.federal_appellate')
    parser = OptionParser(usage)
    parser.add_option('-c', '--courts', dest='court_id', metavar="COURTID",
                      help=('The court(s) to scrape and extract. This should be in '
                            'the form of a python module or package import '
                            'from the Juriscraper library, e.g. '
                            '"juriscraper.opinions.united_states.federal.ca1" or '
                            'simply "opinions" to do all opinions. If desired, '
                            'you can use slashes instead of dots to separate'
                            'the import path.'))
    parser.add_option('-d', '--daemon', action="store_true", dest='daemonmode',
                      default=False, help=('Use this flag to turn on daemon '
                                           'mode, in which all courts requested '
                                           'will be scraped in turn, non-stop.'))
    parser.add_option('-b', '--download_binaries', action='store_true',
                      dest='binaries',
                      default=False,
                      help=('Use this flag if you wish to download the pdf, '
                            'wpd, and doc files.'))
    parser.add_option('-v',
                      '--verbosity',
                      action='count',
                      default=1,
                      help='Increase output verbosity (e.g., -vv is more than -v).')
    parser.add_option('--backscrape',
                      dest='backscrape',
                      action='store_true',
                      default=False,
                      help='Download the historical corpus using the _download_backwards method.')

    (options, args) = parser.parse_args()

    daemon_mode = options.daemonmode
    binaries = options.binaries
    court_id = options.court_id
    backscrape = options.backscrape

    # Set up the print function
    print("Verbosity is set to: %s" % options.verbosity)
    def _v_print(*verb_args):
        if verb_args[0] > (3 - options.verbosity):
            print(verb_args[1])

    global v_print
    v_print = _v_print

    if not court_id:
        parser.error('You must specify a court as a package or module.')
    else:
        court_id = court_id.replace('/', '.')
        if court_id.endswith('.py'):
            court_id = court_id[:-3]

        module_strings = build_module_list(court_id)
        if len(module_strings) == 0:
            parser.error('Unable to import module or package. Aborting.')

        v_print(3, 'Starting up the scraper.')
        num_courts = len(module_strings)
        i = 0
        while i < num_courts:
            # this catches SIGINT, so the code can be killed safely.
            if die_now:
                v_print(3, 'The scraper has stopped.')
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)
            v_print(3, "Current court: %s.%s" % (package, module))

            mod = __import__("%s.%s" % (package, module),
                             globals(),
                             locals(),
                             [module])
            try:
                if backscrape:
                    for site in site_yielder(mod.Site().back_scrape_iterable, mod):
                        site.parse()
                        scrape_court(site, binaries)
                else:
                    site = mod.Site()
                    v_print(3, 'Sent %s request to: %s' % (site.method, site.url))
                    if site.uses_selenium:
                        v_print(3, "Selenium will be used.")
                    site.parse()
                    scrape_court(site, binaries)
            except Exception:
                v_print(3, '*************!! CRAWLER DOWN !!****************')
                v_print(3, '*****scrape_court method failed on mod: %s*****' % module_strings[i])
                v_print(3, '*************!! ACTION NEEDED !!***************')
                v_print(3, traceback.format_exc())
                i += 1
                continue

            last_court_in_list = (i == (num_courts - 1))
            if last_court_in_list and daemon_mode:
                i = 0
            else:
                i += 1

    v_print(3, 'The scraper has stopped.')
    sys.exit(0)
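
main() above (and its near-duplicate in Example #12 below) builds its command line with optparse, which has been deprecated since Python 2.7. A rough argparse equivalent of the same flags, offered as a sketch rather than a drop-in replacement:

import argparse

def make_parser():
    parser = argparse.ArgumentParser(
        description="Scrape and extract court data with Juriscraper.")
    parser.add_argument("-c", "--courts", dest="court_id", metavar="COURTID",
                        help="Court(s) to scrape, given as a Juriscraper module "
                             "or package path, e.g. "
                             "juriscraper.opinions.united_states.federal_appellate.ca1")
    parser.add_argument("-d", "--daemon", dest="daemonmode", action="store_true",
                        help="Scrape the requested courts in turn, non-stop.")
    parser.add_argument("-b", "--download_binaries", dest="binaries",
                        action="store_true",
                        help="Also download the pdf, wpd, and doc files.")
    parser.add_argument("-v", "--verbosity", action="count", default=1,
                        help="Increase output verbosity (e.g., -vv is more than -v).")
    parser.add_argument("--backscrape", action="store_true",
                        help="Download the historical corpus using "
                             "_download_backwards.")
    return parser

options = make_parser().parse_args()
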
Example #10
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)
        global die_now

        # this line is used for handling SIGTERM (CTRL+4), so things can die
        # safely
        signal.signal(signal.SIGTERM, signal_handler)

        module_strings = build_module_list(options["court_id"])
        if not len(module_strings):
            raise CommandError("Unable to import module or package. Aborting.")

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        wait = (options["rate"] * 60) / num_courts
        i = 0
        while i < num_courts:
            # this catches SIGTERM, so the code can be killed safely.
            if die_now:
                logger.info("The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit(".", 1)

            mod = __import__("%s.%s" % (package, module), globals(), locals(),
                             [module])
            # noinspection PyBroadException
            try:
                self.parse_and_scrape_site(mod, options["full_crawl"])
            except Exception as e:
                # noinspection PyBroadException
                try:
                    msg = ("********!! CRAWLER DOWN !!***********\n"
                           "*****scrape_court method failed!*****\n"
                           "********!! ACTION NEEDED !!**********\n%s" %
                           traceback.format_exc())
                    logger.critical(msg)

                    # opinions.united_states.federal.ca9_u --> ca9
                    court_str = mod.Site.__module__.split(".")[-1].split(
                        "_")[0]
                    court = Court.objects.get(pk=court_str)
                    ErrorLog(log_level="CRITICAL", court=court,
                             message=msg).save()
                except Exception as e:
                    # This is very important. Without this, an exception
                    # above will crash the caller.
                    pass
            finally:
                time.sleep(wait)
                last_court_in_list = i == (num_courts - 1)
                if last_court_in_list and options["daemon"]:
                    # Start over...
                    logger.info(
                        "All jurisdictions done. Looping back to "
                        "the beginning because daemon mode is enabled.")
                    i = 0
                else:
                    i += 1

        logger.info("The scraper has stopped.")
        sys.exit(0)
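
The Django-command variants pace themselves with wait = (options["rate"] * 60) / num_courts and sleep that long between courts, which reads as spreading the rate option (in minutes) across one full pass of the court list. A quick sanity check of that arithmetic, with assumed values:

rate = 30          # assumed: minutes budgeted for one full pass
num_courts = 15    # assumed length of module_strings
wait = (rate * 60) / num_courts
print(wait)        # 120.0 on Python 3 -> two minutes of sleep between courts
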
Example #11
def calculate_counts():
    """Grab the information for new documents over the past 30 days, and
    calculate the number of cases found for each court.

    Returns a list like so:
    [('ca1', date1, link), ('ca2', date2, link), ('ca3',...)]
    """
    thirty_days_ago = now() - timedelta(days=30)
    thirty_five_days_ago = now() - timedelta(days=35)
    cts_more_than_30_days = Court.objects \
        .filter(docket__documents__date_filed__gt=thirty_days_ago) \
        .annotate(count=Count('docket__documents__pk')) \
        .values('pk', 'count')

    # Needed because annotation calls above don't return courts with no new
    # opinions
    all_active_courts = Court.objects.filter(has_opinion_scraper=True) \
        .values_list('pk', flat=True).order_by('position')

    # Reformat the results into dicts...
    cts_more_than_30_days = _make_query_dict(cts_more_than_30_days)

    # Combine everything
    most_recent_opinions = []
    recently_dying_courts = []
    mod_list = importer.build_module_list('juriscraper.opinions')
    mod_dict = {}
    for v in mod_list:
        court = v.rsplit('.')[-1]
        mod_dict[court] = v

    for court in all_active_courts:
        if cts_more_than_30_days.get(court, 0) == 0:
            # No results in newer than 35 days. Get date of most recent
            # item.
            date_filed = Document.objects.filter(docket__court_id=court)\
                .order_by('-date_filed')[0].date_filed
            try:
                mod = __import__(
                    mod_dict[court],
                    globals(),
                    locals(),
                    [mod_dict[court].rsplit('.')[0]],
                )
                url = mod.Site().url
                method = mod.Site().method
            except KeyError:
                # Happens when multiple scrapers for single court.
                url = ""
                method = "Unknown"
            if thirty_five_days_ago.date() < date_filed < \
                    thirty_days_ago.date():
                recently_dying_courts.append(
                    (court, date_filed, method, url)
                )
            most_recent_opinions.append(
                (court, date_filed, method, url)
            )

    # Sort by date (index 1)
    most_recent_opinions.sort(key=itemgetter(1), reverse=True)

    return most_recent_opinions, recently_dying_courts
Example #12
def main():
    global die_now

    # this line is used for handling SIGTERM (CTRL+4), so things can die safely
    signal.signal(signal.SIGTERM, signal_handler)

    usage = ('usage: %prog -c COURTID [-d|--daemon] [-b|--binaries]\n\n'
             'To test ca1, downloading binaries, use: \n'
             '    python %prog -c opinions.united_states.federal_appellate.ca1 -b\n\n'
             'To test all federal courts, omitting binaries, use: \n'
             '    python %prog -c opinions.united_states.federal_appellate')
    parser = OptionParser(usage)
    parser.add_option('-c', '--courts', dest='court_id', metavar="COURTID",
                      help=('The court(s) to scrape and extract. This should be in '
                            'the form of a python module or package import '
                            'from the Juriscraper library, e.g. '
                            '"juriscraper.opinions.united_states.federal.ca1" or '
                            'simply "opinions" to do all opinions. If desired, '
                            'you can use slashes instead of dots to separate'
                            'the import path.'))
    parser.add_option('-d', '--daemon', action="store_true", dest='daemonmode',
                      default=False, help=('Use this flag to turn on daemon '
                                           'mode, in which all courts requested '
                                           'will be scraped in turn, non-stop.'))
    parser.add_option('-b', '--download_binaries', action='store_true',
                      dest='binaries',
                      default=False,
                      help=('Use this flag if you wish to download the pdf, '
                            'wpd, and doc files.'))
    parser.add_option('-v',
                      '--verbosity',
                      action='count',
                      default=1,
                      help='Increase output verbosity (e.g., -vv is more than -v).')
    parser.add_option('--backscrape',
                      dest='backscrape',
                      action='store_true',
                      default=False,
                      help='Download the historical corpus using the _download_backwards method.')

    (options, args) = parser.parse_args()

    daemon_mode = options.daemonmode
    binaries = options.binaries
    court_id = options.court_id
    backscrape = options.backscrape

    # Set up the print function
    print "Verbosity is set to: %s" % options.verbosity
    def _v_print(*verb_args):
        if verb_args[0] > (3 - options.verbosity):
            print verb_args[1]

    global v_print
    v_print = _v_print

    if not court_id:
        parser.error('You must specify a court as a package or module.')
    else:
        court_id = court_id.replace('/', '.')
        if court_id.endswith('.py'):
            court_id = court_id[:-3]

        module_strings = build_module_list(court_id)
        if len(module_strings) == 0:
            parser.error('Unable to import module or package. Aborting.')

        v_print(3, 'Starting up the scraper.')
        num_courts = len(module_strings)
        i = 0
        while i < num_courts:
            # this catches SIGINT, so the code can be killed safely.
            if die_now:
                v_print(3, 'The scraper has stopped.')
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)
            v_print(3, "Current court: %s.%s" % (package, module))

            mod = __import__("%s.%s" % (package, module),
                             globals(),
                             locals(),
                             [module])
            try:
                if backscrape:
                    for site in site_yielder(mod.Site().back_scrape_iterable, mod):
                        site.parse()
                        scrape_court(site, binaries)
                else:
                    site = mod.Site()
                    v_print(3, 'Sent %s request to: %s' % (site.method, site.url))
                    if site.uses_selenium:
                        v_print(3, "Selenium will be used.")
                    site.parse()
                    scrape_court(site, binaries)
            except Exception:
                v_print(3, '*************!! CRAWLER DOWN !!****************')
                v_print(3, '*****scrape_court method failed on mod: %s*****' % module_strings[i])
                v_print(3, '*************!! ACTION NEEDED !!***************')
                v_print(3, traceback.format_exc())
                i += 1
                continue

            last_court_in_list = (i == (num_courts - 1))
            if last_court_in_list and daemon_mode:
                i = 0
            else:
                i += 1

    v_print(3, 'The scraper has stopped.')
    sys.exit(0)
Example #13
    def test_scrape_all_example_files(self):
        """Finds all the $module_example* files and tests them with the sample
        scraper.
        """

        module_strings = build_module_list('juriscraper')
        num_scrapers = len([s for s in module_strings
                            if 'backscraper' not in s])
        msg = "Testing {count} scrapers against their example files:"
        print(msg.format(count=num_scrapers))
        max_len_mod_string = max(len(mod) for mod in module_strings
                                 if 'backscraper' not in mod) + 2
        num_example_files = 0
        num_warnings = 0
        cnt = CaseNameTweaker()
        for module_string in module_strings:
            package, module = module_string.rsplit('.', 1)
            mod = __import__("%s.%s" % (package, module),
                             globals(),
                             locals(),
                             [module])
            if 'backscraper' not in module_string:
                sys.stdout.write(
                    '  %s ' % module_string.ljust(max_len_mod_string)
                )
                sys.stdout.flush()
                # module_parts:
                # [0]  - "juriscraper"
                # [1]  - "opinions" or "oral_args"
                # ...  - rest of the path
                # [-1] - module name
                module_parts = module_string.split('.')
                example_path = os.path.join(
                    "tests", "examples", module_parts[1],
                    "united_states", module_parts[-1],
                )
                paths = glob.glob('%s_example*' % example_path)
                self.assertTrue(
                    paths,
                    "No example file found for: %s! \n\nThe test looked in: "
                    "%s" % (
                        module_string.rsplit('.', 1)[1],
                        os.path.join(os.getcwd(), example_path),
                    ))
                num_example_files += len(paths)
                t1 = time.time()
                num_tests = len(paths)
                for path in paths:
                    # This loop allows multiple example files per module
                    if path.endswith('~'):
                        # Text editor backup: Not interesting.
                        continue
                    site = mod.Site(cnt=cnt)
                    site.url = path
                    # Forces a local GET
                    site.method = 'LOCAL'
                    site.parse()
                t2 = time.time()

                max_speed = 15
                warn_speed = 1
                speed = t2 - t1
                msg = ''
                if speed > max_speed:
                    if sys.gettrace() is None:
                        # Only do this if we're not debugging. Debuggers make
                        # things slower and breakpoints make things stop.
                        raise SlownessException(
                            "This scraper took {speed}s to test, which is more "
                            "than the allowed speed of {max_speed}s. "
                            "Please speed it up for tests to pass.".format(
                                speed=speed,
                                max_speed=max_speed,
                            ))
                elif speed > warn_speed:
                    msg = ' - WARNING: SLOW SCRAPER'
                    num_warnings += 1
                else:
                    msg = ''

                print('(%s test(s) in %0.1f seconds%s)' % (num_tests, speed, msg))

        print("\n{num_scrapers} scrapers tested successfully against "
              "{num_example_files} example files, with {num_warnings} "
              "speed warnings.".format(
                  num_scrapers=num_scrapers,
                  num_example_files=num_example_files,
                  num_warnings=num_warnings,))
        if num_warnings:
            print("\nAt least one speed warning was triggered during the "
                   "tests. If this is due to a slow scraper you wrote, we "
                   "suggest attempting to speed it up, as it will be slow "
                   "both in production and while running tests. This is "
                   "currently a warning, but may raise a failure in the "
                   "future as performance requirements are tightened.")
        else:
            # Someday, this line of code will be run. That day is not today.
            print("\nNo speed warnings detected. That's great, keep up the " \
                  "good work!")
Example #14
    def test_scrape_all_example_files(self):
        """Finds all the $module_example* files and tests them with the sample
        scraper.
        """

        module_strings = build_module_list('juriscraper')
        num_scrapers = len([s for s in module_strings
                            if 'backscraper' not in s])
        msg = "Testing {count} scrapers against their example files:"
        print(msg.format(count=num_scrapers))
        max_len_mod_string = max(len(mod) for mod in module_strings
                                 if 'backscraper' not in mod) + 2
        num_example_files = 0
        num_warnings = 0
        cnt = CaseNameTweaker()
        json_compare_extension = '.compare.json'
        for module_string in module_strings:
            package, module = module_string.rsplit('.', 1)
            mod = __import__("%s.%s" % (package, module),
                             globals(),
                             locals(),
                             [module])
            if 'backscraper' not in module_string:
                sys.stdout.write(
                    '  %s ' % module_string.ljust(max_len_mod_string)
                )
                sys.stdout.flush()
                # module_parts:
                # [0]  - "juriscraper"
                # [1]  - "opinions" or "oral_args"
                # ...  - rest of the path
                # [-1] - module name
                module_parts = module_string.split('.')
                example_path = os.path.join(
                    "tests", "examples", module_parts[1],
                    "united_states", module_parts[-1],
                )
                paths = [path for path in glob.glob('%s_example*' % example_path)
                         if not path.endswith(json_compare_extension)]
                self.assertTrue(
                    paths,
                    "No example file found for: %s! \n\nThe test looked in: "
                    "%s" % (
                        module_string.rsplit('.', 1)[1],
                        os.path.join(os.getcwd(), example_path),
                    ))
                num_example_files += len(paths)
                t1 = time.time()
                num_tests = len(paths)
                for path in paths:
                    # This loop allows multiple example files per module
                    if path.endswith('~'):
                        # Text editor backup: Not interesting.
                        continue
                    site = mod.Site(cnt=cnt)
                    site.url = path
                    # Forces a local GET
                    site.method = 'LOCAL'
                    site.parse()
                    # Now validate that the parsed result is as we expect
                    json_path = '%s%s' % (path.rsplit('.', 1)[0], json_compare_extension)
                    json_data = json.loads(site.to_json(), encoding='utf-8')
                    if os.path.isfile(json_path):
                        # Compare result with corresponding json file
                        example_file = path.rsplit('/', 1)[1]
                        compare_file = json_path.rsplit('/', 1)[1]
                        with open(json_path, 'r') as input_file:
                            fixture_json = json.load(input_file)
                            self.assertEqual(
                                len(fixture_json),
                                len(json_data),
                                msg="Fixture and scraped data have different "
                                    "lengths: expected %s and scraped %s (%s)" % (
                                    len(fixture_json),
                                    len(json_data),
                                    module_string
                                )
                            )
                            for i, item in enumerate(fixture_json):
                                self.assertEqual(
                                    fixture_json[i],
                                    json_data[i],
                                )

                    else:
                        # Generate corresponding json file if it doesn't
                        # already exist. This should only happen once
                        # when adding a new example html file.
                        with open(json_path, 'w') as json_example:
                            json.dump(json_data, json_example, indent=2)
                t2 = time.time()

                max_speed = 15
                warn_speed = 1
                speed = t2 - t1
                msg = ''
                if speed > max_speed:
                    if sys.gettrace() is None and not IS_TRAVIS:
                        # Only do this if we're not debugging. Debuggers make
                        # things slower and breakpoints make things stop.
                        raise SlownessException(
                            "This scraper took {speed}s to test, which is more "
                            "than the allowed speed of {max_speed}s. "
                            "Please speed it up for tests to pass.".format(
                                speed=speed,
                                max_speed=max_speed,
                            ))
                elif speed > warn_speed:
                    msg = ' - WARNING: SLOW SCRAPER'
                    num_warnings += 1
                else:
                    msg = ''

                print('(%s test(s) in %0.1f seconds%s)' % (num_tests, speed, msg))

        print("\n{num_scrapers} scrapers tested successfully against "
              "{num_example_files} example files, with {num_warnings} "
              "speed warnings.".format(
                  num_scrapers=num_scrapers,
                  num_example_files=num_example_files,
                  num_warnings=num_warnings,))
        if num_warnings:
            print("\nAt least one speed warning was triggered during the "
                   "tests. If this is due to a slow scraper you wrote, we "
                   "suggest attempting to speed it up, as it will be slow "
                   "both in production and while running tests. This is "
                   "currently a warning, but may raise a failure in the "
                   "future as performance requirements are tightened.")
        else:
            # Someday, this line of code will be run. That day is not today.
            print("\nNo speed warnings detected. That's great, keep up the " \
                  "good work!")
Example #15
def main():
    global die_now

    # this line is used for handling SIGTERM (CTRL+4), so things can die safely
    signal.signal(signal.SIGTERM, signal_handler)

    usage = (
        "usage: %prog -c COURTID [-d|--daemon] [-b|--binaries] [-r|--report]\n\n"
        "To test ca1, downloading binaries, use: \n"
        "    python %prog -c opinions.united_states.federal_appellate.ca1 -b\n\n"
        "To test all federal courts, omitting binaries, use: \n"
        "    python %prog -c opinions.united_states.federal_appellate"
        "Passing the --report option will generate an HTML report in "
        "the root directory after scrapers have run")
    parser = OptionParser(usage)
    parser.add_option(
        "-c",
        "--courts",
        dest="court_id",
        metavar="COURTID",
        help=("The court(s) to scrape and extract. This should be in "
              "the form of a python module or package import "
              "from the Juriscraper library, e.g. "
              '"juriscraper.opinions.united_states.federal.ca1" or '
              'simply "opinions" to do all opinions. If desired, '
              "you can use slashes instead of dots to separate"
              "the import path."),
    )
    parser.add_option(
        "-d",
        "--daemon",
        action="store_true",
        dest="daemonmode",
        default=False,
        help=("Use this flag to turn on daemon "
              "mode, in which all courts requested "
              "will be scraped in turn, non-stop."),
    )
    parser.add_option(
        "-b",
        "--download_binaries",
        action="store_true",
        dest="binaries",
        default=False,
        help=("Use this flag if you wish to download the pdf, "
              "wpd, and doc files."),
    )
    parser.add_option(
        "-v",
        "--verbosity",
        action="count",
        default=1,
        help="Increase output verbosity (e.g., -vv is more than -v).",
    )
    parser.add_option(
        "--backscrape",
        dest="backscrape",
        action="store_true",
        default=False,
        help=
        "Download the historical corpus using the _download_backwards method.",
    )
    parser.add_option(
        "-r",
        "--report",
        action="store_true",
        default=False,
        help="Generate a report.html with the outcome of running the scrapers",
    )

    (options, args) = parser.parse_args()

    daemon_mode = options.daemonmode
    binaries = options.binaries
    court_id = options.court_id
    backscrape = options.backscrape
    generate_report = options.report

    # Set up the print function
    print("Verbosity is set to: %s" % options.verbosity)

    def _v_print(*verb_args):
        if verb_args[0] > (3 - options.verbosity):
            print(verb_args[1])

    global v_print
    v_print = _v_print

    results = {}

    if not court_id:
        parser.error("You must specify a court as a package or module.")
    else:
        court_id = court_id.replace("/", ".")
        if court_id.endswith(".py"):
            court_id = court_id[:-3]

        module_strings = build_module_list(court_id)
        if len(module_strings) == 0:
            parser.error("Unable to import module or package. Aborting.")

        v_print(3, "Starting up the scraper.")
        num_courts = len(module_strings)
        i = 0
        while i < num_courts:
            current_court = module_strings[i]
            results[current_court] = {"global_failure": False}
            # this catches SIGINT, so the code can be killed safely.
            if die_now:
                v_print(3, "The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit(".", 1)
            v_print(3, "Current court: %s.%s" % (package, module))

            mod = __import__("%s.%s" % (package, module), globals(), locals(),
                             [module])
            try:
                if backscrape:
                    for site in site_yielder(mod.Site().back_scrape_iterable,
                                             mod):
                        site.parse()
                        scrape_court(site, binaries)
                else:
                    site = mod.Site()
                    v_print(3,
                            "Sent %s request to: %s" % (site.method, site.url))
                    if site.uses_selenium:
                        v_print(3, "Selenium will be used.")
                    site.parse()
                    results[current_court]["scrape"] = scrape_court(
                        site, binaries)
            except Exception:
                results[current_court][
                    "global_failure"] = traceback.format_exc()
                results[current_court]["scrape"] = {}
                v_print(3, "*************!! CRAWLER DOWN !!****************")
                v_print(
                    3,
                    "*****scrape_court method failed on mod: %s*****" %
                    module_strings[i],
                )
                v_print(3, "*************!! ACTION NEEDED !!***************")
                v_print(3, traceback.format_exc())
                i += 1
                continue

            last_court_in_list = i == (num_courts - 1)
            if last_court_in_list and daemon_mode:
                i = 0
            else:
                i += 1

    v_print(3, "The scraper has stopped.")

    if generate_report:
        report_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "../report.html"))
        v_print(3, "Generating HTML report at %s" % report_path)
        generate_scraper_report(report_path, results)

    sys.exit(0)
Example #16
    def test_scrape_all_example_files(self):
        """Finds all the $module_example* files and tests them with the sample
        scraper.
        """

        module_strings = build_module_list('juriscraper')
        num_scrapers = len([s for s in module_strings
                            if 'backscraper' not in s])
        max_len_mod_string = max(len(mod) for mod in module_strings
                                 if 'backscraper' not in mod) + 2
        num_example_files = 0
        num_warnings = 0
        cnt = CaseNameTweaker()
        json_compare_extension = '.compare.json'
        json_compare_files_generated = []
        for module_string in module_strings:
            package, module = module_string.rsplit('.', 1)
            mod = __import__("%s.%s" % (package, module),
                             globals(),
                             locals(),
                             [module])
            if 'backscraper' not in module_string:
                sys.stdout.write(
                    '  %s ' % module_string.ljust(max_len_mod_string)
                )
                sys.stdout.flush()
                # module_parts:
                # [0]  - "juriscraper"
                # [1]  - "opinions" or "oral_args"
                # ...  - rest of the path
                # [-1] - module name
                module_parts = module_string.split('.')
                example_path = os.path.join(
                    "tests", "examples", module_parts[1],
                    "united_states", module_parts[-1],
                )
                paths = [path for path in glob.glob('%s_example*' % example_path)
                         if not path.endswith(json_compare_extension)]
                self.assertTrue(
                    paths,
                    "No example file found for: %s! \n\nThe test looked in: "
                    "%s" % (
                        module_string.rsplit('.', 1)[1],
                        os.path.join(os.getcwd(), example_path),
                    ))
                num_example_files += len(paths)
                t1 = time.time()
                num_tests = len(paths)
                for path in paths:
                    # This loop allows multiple example files per module
                    if path.endswith('~'):
                        # Text editor backup: Not interesting.
                        continue
                    site = mod.Site(cnt=cnt)
                    site.url = path
                    # Forces a local GET
                    site.enable_test_mode()
                    site.parse()
                    # Now validate that the parsed result is as we expect
                    json_path = '%s%s' % (path.rsplit('.', 1)[0], json_compare_extension)
                    json_data = json.loads(site.to_json(), encoding='utf-8')
                    if os.path.isfile(json_path):
                        # Compare result with corresponding json file
                        example_file = path.rsplit('/', 1)[1]
                        compare_file = json_path.rsplit('/', 1)[1]
                        with open(json_path, 'r') as input_file:
                            fixture_json = json.load(input_file)
                            self.assertEqual(
                                len(fixture_json),
                                len(json_data),
                                msg="Fixture and scraped data have different "
                                    "lengths: expected %s and scraped %s (%s)" % (
                                    len(fixture_json),
                                    len(json_data),
                                    module_string
                                )
                            )
                            for i, item in enumerate(fixture_json):
                                self.assertEqual(
                                    fixture_json[i],
                                    json_data[i],
                                )

                    else:
                        # Generate corresponding json file if it doesn't
                        # already exist. This should only happen once
                        # when adding a new example html file.
                        warn_generated_compare_file(json_path)
                        json_compare_files_generated.append(json_path)
                        with open(json_path, 'w') as json_example:
                            json.dump(json_data, json_example, indent=2)
                t2 = time.time()
                duration = t2 - t1
                warning_msg = warn_or_crash_slow_parser(t2 - t1)
                if warning_msg:
                    num_warnings += 1

                print('(%s test(s) in %0.1f seconds)' %
                      (num_tests, duration))

        print("\n{num_scrapers} scrapers tested successfully against "
              "{num_example_files} example files, with {num_warnings} "
              "speed warnings.".format(
                  num_scrapers=num_scrapers,
                  num_example_files=num_example_files,
                  num_warnings=num_warnings,))
        if json_compare_files_generated:
            msg = 'Generated compare file(s) during test, please review before proceeding. ' \
                  'If the data looks good, run tests again, then be sure to include ' \
                  'the new compare file(s) in your commit: %s'
            self.fail(msg % ', '.join(json_compare_files_generated))
        if num_warnings:
            print("\nAt least one speed warning was triggered during the "
                   "tests. If this is due to a slow scraper you wrote, we "
                   "suggest attempting to speed it up, as it will be slow "
                   "both in production and while running tests. This is "
                   "currently a warning, but may raise a failure in the "
                   "future as performance requirements are tightened.")
        else:
            # Someday, this line of code will be run. That day is not today.
            print("\nNo speed warnings detected. That's great, keep up the " \
                  "good work!")
Example #17
    def test_scrape_all_example_files(self):
        """Finds all the $module_example* files and tests them with the sample
        scraper.
        """

        module_strings = build_module_list('juriscraper')
        num_scrapers = len([s for s in module_strings
                            if 'backscraper' not in s])
        print "Testing {count} scrapers against their example files:".format(
            count=num_scrapers)
        max_len_mod_string = max(len(mod) for mod in module_strings
                                 if 'backscraper' not in mod) + 2
        num_example_files = 0
        num_warnings = 0
        for module_string in module_strings:
            package, module = module_string.rsplit('.', 1)
            mod = __import__("%s.%s" % (package, module),
                             globals(),
                             locals(),
                             [module])
            if 'backscraper' not in module_string:
                sys.stdout.write(
                    '  %s ' % module_string.ljust(max_len_mod_string)
                )
                sys.stdout.flush()
                paths = glob.glob(
                    '%s_example*' % module_string.replace('.', '/'))
                self.assertTrue(paths, "No example file found for: %s!" %
                                module_string.rsplit('.', 1)[1])
                num_example_files += len(paths)
                t1 = time.time()
                num_tests = len(paths)
                for path in paths:
                    # This loop allows multiple example files per module
                    if path.endswith('~'):
                        # Text editor backup: Not interesting.
                        continue
                    site = mod.Site()
                    site.url = path
                    # Forces a local GET
                    site.method = 'LOCAL'
                    site.parse()
                t2 = time.time()

                max_speed = 10
                warn_speed = 1
                speed = t2 - t1
                if speed > max_speed:
                    raise SlownessException(
                        "This scraper took {speed}s to test, which is more "
                        "than the allowed speed of {max_speed}s. "
                        "Please speed it up for tests to pass.".format(
                            speed=speed,
                            max_speed=max_speed,
                        ))
                elif speed > warn_speed:
                    msg = ' - WARNING: SLOW SCRAPER'
                    num_warnings += 1
                else:
                    msg = ''

                print '(%s test(s) in %0.1f seconds%s)' % (
                    num_tests, speed, msg
                )

        print ("\n{num_scrapers} scrapers tested successfully against "
               "{num_example_files} example files, with {num_warnings} "
               "speed warnings.".format(
            num_scrapers=num_scrapers,
            num_example_files=num_example_files,
            num_warnings=num_warnings,
        ))
        if num_warnings:
            print ("\nAt least one speed warning was triggered during the "
                   "tests. If this is due to a slow scraper you wrote, we "
                   "suggest attempting to speed it up, as it will be slow "
                   "both in production and while running tests. This is "
                   "currently a warning, but may raise a failure in the "
                   "future as performance requirements are tightened.")
        else:
            # Someday, this line of code will be run. That day is not today.
            print "\nNo speed warnings detected. That's great, keep up the " \
                  "good work!"
Example #18
    def test_scrape_all_example_files(self):
        """Finds all the $module_example* files and tests them with the sample
        scraper.
        """

        module_strings = build_module_list('juriscraper')
        num_scrapers = len([s for s in module_strings
                            if 'backscraper' not in s])
        msg = "Testing {count} scrapers against their example files:"
        print(msg.format(count=num_scrapers))
        max_len_mod_string = max(len(mod) for mod in module_strings
                                 if 'backscraper' not in mod) + 2
        num_example_files = 0
        num_warnings = 0
        cnt = CaseNameTweaker()
        json_compare_extension = '.compare.json'
        for module_string in module_strings:
            package, module = module_string.rsplit('.', 1)
            mod = __import__("%s.%s" % (package, module),
                             globals(),
                             locals(),
                             [module])
            if 'backscraper' not in module_string:
                sys.stdout.write(
                    '  %s ' % module_string.ljust(max_len_mod_string)
                )
                sys.stdout.flush()
                # module_parts:
                # [0]  - "juriscraper"
                # [1]  - "opinions" or "oral_args"
                # ...  - rest of the path
                # [-1] - module name
                module_parts = module_string.split('.')
                example_path = os.path.join(
                    "tests", "examples", module_parts[1],
                    "united_states", module_parts[-1],
                )
                paths = [path for path in glob.glob('%s_example*' % example_path)
                         if not path.endswith(json_compare_extension)]
                self.assertTrue(
                    paths,
                    "No example file found for: %s! \n\nThe test looked in: "
                    "%s" % (
                        module_string.rsplit('.', 1)[1],
                        os.path.join(os.getcwd(), example_path),
                    ))
                num_example_files += len(paths)
                t1 = time.time()
                num_tests = len(paths)
                for path in paths:
                    # This loop allows multiple example files per module
                    if path.endswith('~'):
                        # Text editor backup: Not interesting.
                        continue
                    site = mod.Site(cnt=cnt)
                    site.url = path
                    # Forces a local GET
                    site.method = 'LOCAL'
                    site.parse()
                    # Now validate that the parsed result is as we expect
                    json_path = '%s%s' % (path.rsplit('.', 1)[0], json_compare_extension)
                    json_data = site.to_json()
                    if os.path.isfile(json_path):
                        # Compare result with corresponding json file
                        example_file = path.rsplit('/', 1)[1]
                        compare_file = json_path.rsplit('/', 1)[1]
                        error = ('The result of parsing ' + example_file +
                                 ' does not match the expected data in ' +
                                 compare_file + '. Either the later has ' +
                                 'bad data or recent changes to this scraper ' +
                                 'are incompatible with the ' + example_file +
                                 ' use case. PARSED JSON: ' + json_data)
                        with open(json_path, 'r') as input_file:
                            self.assertEqual(input_file.read(), json_data, error)
                    else:
                        # Generate corresponding json file if it doesn't
                        # already exist. This should only happen once
                        # when adding a new example html file.
                        with open(json_path, 'w') as json_example:
                            json_example.write(json_data)
                t2 = time.time()

                max_speed = 15
                warn_speed = 1
                speed = t2 - t1
                msg = ''
                if speed > max_speed:
                    if sys.gettrace() is None and not IS_TRAVIS:
                        # Only do this if we're not debugging. Debuggers make
                        # things slower and breakpoints make things stop.
                        raise SlownessException(
                            "This scraper took {speed}s to test, which is more "
                            "than the allowed speed of {max_speed}s. "
                            "Please speed it up for tests to pass.".format(
                                speed=speed,
                                max_speed=max_speed,
                            ))
                elif speed > warn_speed:
                    msg = ' - WARNING: SLOW SCRAPER'
                    num_warnings += 1
                else:
                    msg = ''

                print('(%s test(s) in %0.1f seconds%s)' % (num_tests, speed, msg))

        print("\n{num_scrapers} scrapers tested successfully against "
              "{num_example_files} example files, with {num_warnings} "
              "speed warnings.".format(
                  num_scrapers=num_scrapers,
                  num_example_files=num_example_files,
                  num_warnings=num_warnings,))
        if num_warnings:
            print("\nAt least one speed warning was triggered during the "
                   "tests. If this is due to a slow scraper you wrote, we "
                   "suggest attempting to speed it up, as it will be slow "
                   "both in production and while running tests. This is "
                   "currently a warning, but may raise a failure in the "
                   "future as performance requirements are tightened.")
        else:
            # Someday, this line of code will be run. That day is not today.
            print("\nNo speed warnings detected. That's great, keep up the " \
                  "good work!")