Example #1
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        if options['sweep'] is False:
            # Only allow one script at a time per court combination.
            # Note that multiple scripts on multiple machines could still be
            # run.
            court_str = '-'.join(sorted(options['courts']))
            with open('/tmp/rss-scraper-%s.pid' % court_str, 'w') as fp:
                try:
                    fcntl.lockf(fp, fcntl.LOCK_EX | fcntl.LOCK_NB)
                except IOError:
                    print("Another instance of this program is running with "
                          "for this combination of courts. Only one instance "
                          "can crawl these courts at a time: '%s'" % court_str)
                    sys.exit(1)

        # Loop over the PACER sites that have RSS feeds and see if they're
        # ready to crawl.
        courts = Court.objects.filter(
            jurisdiction__in=[
                Court.FEDERAL_BANKRUPTCY,
                Court.FEDERAL_DISTRICT,
            ],
            pacer_has_rss_feed=True,
        )
        if options['courts'] != ['all']:
            courts = courts.filter(pk__in=options['courts'])

        iterations_completed = 0
        last_trim_date = None
        while options['iterations'] == 0 or \
                iterations_completed < options['iterations']:
            for court in courts:
                # Check the last time we successfully got the feed
                try:
                    feed_status = RssFeedStatus.objects.filter(
                        court=court,
                        is_sweep=options['sweep'],
                        status__in=[
                            RssFeedStatus.PROCESSING_SUCCESSFUL,
                            RssFeedStatus.UNCHANGED,
                            RssFeedStatus.PROCESSING_IN_PROGRESS,
                        ]
                    ).latest('date_created')
                except RssFeedStatus.DoesNotExist:
                    # First time running it or status items have been nuked by
                    # an admin. Make a dummy object, but no need to actually
                    # save it to the DB. Make it old.
                    lincolns_birthday = make_aware(datetime(1809, 2, 12))
                    feed_status = RssFeedStatus(
                        date_created=lincolns_birthday,
                        date_last_build=lincolns_birthday,
                        is_sweep=options['sweep'],
                    )
                if options['courts'] == ['all'] and options['sweep'] is False:
                    # If it's all courts and it's not a sweep, check if we did
                    # it recently.
                    max_visit_ago = now() - timedelta(
                        seconds=self.RSS_MAX_VISIT_FREQUENCY)
                    if feed_status.date_created > max_visit_ago:
                        # Processed too recently. Try next court.
                        continue

                # Give a court some time to complete during non-sweep crawls
                processing_cutoff = now() - timedelta(
                    seconds=self.RSS_MAX_PROCESSING_DURATION)
                if all([
                    options['sweep'] is False,
                    feed_status.status == RssFeedStatus.PROCESSING_IN_PROGRESS,
                    feed_status.date_created > processing_cutoff
                ]):
                    continue

                # The court is ripe! Crawl it if it has changed.
                # Make a new object to track the attempted crawl.
                new_status = RssFeedStatus.objects.create(
                    court_id=court.pk,
                    status=RssFeedStatus.PROCESSING_IN_PROGRESS,
                    is_sweep=options['sweep'],
                )

                # Check if the item needs crawling, and crawl it if so.
                chain(
                    check_if_feed_changed.s(court.pk, new_status.pk,
                                            feed_status.date_last_build),
                    merge_rss_feed_contents.s(court.pk, new_status.pk),
                    send_docket_alerts.s(),
                    # Update recap *documents*, not *dockets*. Updating dockets
                    # requires much more work, and we don't expect to get much
                    # docket information from the RSS feeds. RSS feeds also
                    # have information about hundreds or thousands of
                    # dockets. Updating them all would be very bad.
                    add_items_to_solr.s('search.RECAPDocument'),
                    mark_status_successful.si(new_status.pk),
                ).apply_async()

            # Trim if not too recently trimmed.
            trim_cutoff_date = now() - timedelta(
                seconds=self.DELAY_BETWEEN_CACHE_TRIMS)
            if last_trim_date is None or trim_cutoff_date > last_trim_date:
                trim_rss_cache.delay()
                last_trim_date = now()

            # Wait, then attempt the courts again if iterations not exceeded.
            iterations_completed += 1
            time.sleep(self.DELAY_BETWEEN_ITERATIONS)
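The single-instance guard at the top of handle() relies on a non-blocking exclusive fcntl lock on a per-court-combination PID file. Below is a minimal standalone sketch of that idiom; the helper name and PID path are illustrative, not part of the project, and the helper returns the open file object so the caller can hold the lock for as long as the process needs it.

import fcntl
import sys


def acquire_single_instance_lock(pid_path):
    """Take an exclusive, non-blocking lock on pid_path, or exit.

    The lock is held for as long as the returned file object stays
    open, so keep a reference to it.
    """
    fp = open(pid_path, 'w')
    try:
        # LOCK_EX: exclusive lock; LOCK_NB: raise instead of blocking
        # if another process already holds the lock.
        fcntl.lockf(fp, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
        print("Another instance is already running: %s" % pid_path)
        sys.exit(1)
    return fp


# Illustrative usage with a hypothetical court combination.
lock_fp = acquire_single_instance_lock('/tmp/rss-scraper-cand.pid')
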
Example #2
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        if options["sweep"] is False:
            # Only allow one script at a time per court combination.
            # Note that multiple scripts on multiple machines could still be
            # run.
            court_str = "-".join(sorted(options["courts"]))
            with open(f"/tmp/rss-scraper-{court_str}.pid", "w") as fp:
                try:
                    fcntl.lockf(fp, fcntl.LOCK_EX | fcntl.LOCK_NB)
                except IOError:
                    print("Another instance of this program is running with "
                          "this combination of courts. Only one instance "
                          "can crawl these courts at a time: '%s'" % court_str)
                    sys.exit(1)

        # Loop over the PACER sites that have RSS feeds and see if they're
        # ready to crawl.
        courts = Court.federal_courts.district_pacer_courts().filter(
            pacer_has_rss_feed=True, )
        if options["courts"] != ["all"]:
            courts = courts.filter(pk__in=options["courts"])

        iterations_completed = 0
        last_trim_date = None
        while (options["iterations"] == 0
               or iterations_completed < options["iterations"]):
            for court in courts:
                # Check the last time we successfully got the feed
                try:
                    feed_status = RssFeedStatus.objects.filter(
                        court=court,
                        is_sweep=options["sweep"],
                        status__in=[
                            RssFeedStatus.PROCESSING_SUCCESSFUL,
                            RssFeedStatus.UNCHANGED,
                            RssFeedStatus.PROCESSING_IN_PROGRESS,
                        ],
                    ).latest("date_created")
                except RssFeedStatus.DoesNotExist:
                    # First time running it or status items have been nuked by
                    # an admin. Make a dummy object, but no need to actually
                    # save it to the DB. Make it old.
                    lincolns_birthday = make_aware(datetime(1809, 2, 12))
                    feed_status = RssFeedStatus(
                        date_created=lincolns_birthday,
                        date_last_build=lincolns_birthday,
                        is_sweep=options["sweep"],
                    )
                if options["courts"] == ["all"] and options["sweep"] is False:
                    # If it's all courts and it's not a sweep, check if we did
                    # it recently.
                    max_visit_ago = now() - timedelta(
                        seconds=self.RSS_MAX_VISIT_FREQUENCY)
                    if feed_status.date_created > max_visit_ago:
                        # Processed too recently. Try next court.
                        continue

                # Don't crawl a court if it says it's been in progress just a
                # little while. It's probably queued and on the way.
                processing_cutoff = now() - timedelta(
                    seconds=self.RSS_MAX_PROCESSING_DURATION)
                if all([
                        options["sweep"] is False,
                        feed_status.status ==
                        RssFeedStatus.PROCESSING_IN_PROGRESS,
                        feed_status.date_created > processing_cutoff,
                ]):
                    continue

                # The court is ripe! Crawl it if it has changed.
                # Make a new object to track the attempted crawl.
                new_status = RssFeedStatus.objects.create(
                    court_id=court.pk,
                    status=RssFeedStatus.PROCESSING_IN_PROGRESS,
                    is_sweep=options["sweep"],
                )

                # Check if the item needs crawling, and crawl it if so.
                chain(
                    check_if_feed_changed.s(court.pk, new_status.pk,
                                            feed_status.date_last_build),
                    merge_rss_feed_contents.s(court.pk),
                    send_docket_alerts.s(),
                    # Update recap *documents*, not *dockets*. Updating dockets
                    # requires much more work, and we don't expect to get much
                    # docket information from the RSS feeds. RSS feeds also
                    # have information about hundreds or thousands of
                    # dockets. Updating them all would be very bad.
                    add_items_to_solr.s("search.RECAPDocument"),
                    mark_status_successful.si(new_status.pk),
                ).apply_async()

            # Trim if not too recently trimmed.
            trim_cutoff_date = now() - timedelta(
                seconds=self.DELAY_BETWEEN_CACHE_TRIMS)
            if last_trim_date is None or trim_cutoff_date > last_trim_date:
                trim_rss_data.delay()
                last_trim_date = now()

            # Wait, then attempt the courts again if iterations not exceeded.
            iterations_completed += 1
            remaining_iterations = options["iterations"] - iterations_completed
            if remaining_iterations > 0:
                time.sleep(self.DELAY_BETWEEN_ITERATIONS)
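The chain(...) call builds a Celery workflow in which every task added with .s() receives the previous task's return value prepended to its arguments, while mark_status_successful is added with .si(), an immutable signature that ignores the parent result. A hedged sketch of that mechanic follows; the task names, arguments, and bodies are simplified stand-ins for the project's real tasks, and executing the chain requires a configured Celery app and broker.

from celery import chain, shared_task


@shared_task
def fetch_feed(court_id):
    # Stand-in for check_if_feed_changed: returns a value for the next
    # task in the chain to consume.
    return {"court_id": court_id, "entries": []}


@shared_task
def merge_feed(feed_data, court_id):
    # Added with .s(), so the parent's return value (feed_data) is
    # prepended to the argument supplied at chain-build time (court_id).
    return len(feed_data["entries"])


@shared_task
def mark_done(status_pk):
    # Added with .si(): an immutable signature, so the parent result is
    # discarded and only status_pk is passed.
    print("status %s complete" % status_pk)


chain(
    fetch_feed.s("cand"),
    merge_feed.s("cand"),
    mark_done.si(42),
).apply_async()
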
Example #3
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        if options['sweep'] is False:
            # Only allow one script at a time per court combination.
            # Note that multiple scripts on multiple machines could still be
            # run.
            court_str = '-'.join(sorted(options['courts']))
            with open('/tmp/rss-scraper-%s.pid' % court_str, 'w') as fp:
                try:
                    fcntl.lockf(fp, fcntl.LOCK_EX | fcntl.LOCK_NB)
                except IOError:
                    print("Another instance of this program is running with "
                          "for this combination of courts. Only one instance "
                          "can crawl these courts at a time: '%s'" % court_str)
                    sys.exit(1)

        # Loop over the PACER sites that have RSS feeds and see if they're
        # ready to crawl.
        courts = Court.objects.filter(
            jurisdiction__in=[
                Court.FEDERAL_BANKRUPTCY,
                Court.FEDERAL_DISTRICT,
            ],
            pacer_has_rss_feed=True,
        )
        if options['courts'] != ['all']:
            courts = courts.filter(pk__in=options['courts'])

        iterations_completed = 0
        last_trim_date = None
        while options['iterations'] == 0 or \
                iterations_completed < options['iterations']:
            for court in courts:
                # Check the last time we successfully got the feed
                try:
                    feed_status = RssFeedStatus.objects.filter(
                        court=court,
                        is_sweep=options['sweep'],
                        status__in=[
                            RssFeedStatus.PROCESSING_SUCCESSFUL,
                            RssFeedStatus.UNCHANGED,
                            RssFeedStatus.PROCESSING_IN_PROGRESS,
                        ]
                    ).latest('date_created')
                except RssFeedStatus.DoesNotExist:
                    # First time running it or status items have been nuked by
                    # an admin. Make a dummy object, but no need to actually
                    # save it to the DB. Make it old.
                    lincolns_birthday = make_aware(datetime(1809, 2, 12))
                    feed_status = RssFeedStatus(
                        date_created=lincolns_birthday,
                        date_last_build=lincolns_birthday,
                        is_sweep=options['sweep'],
                    )
                if options['courts'] == ['all'] and options['sweep'] is False:
                    # If it's all courts and it's not a sweep, check if we did
                    # it recently.
                    max_visit_ago = now() - timedelta(
                        seconds=self.RSS_MAX_VISIT_FREQUENCY)
                    if feed_status.date_created > max_visit_ago:
                        # Processed too recently. Try next court.
                        continue

                # Give a court some time to complete during non-sweep crawls
                processing_cutoff = now() - timedelta(
                    seconds=self.RSS_MAX_PROCESSING_DURATION)
                if all([
                    options['sweep'] is False,
                    feed_status.status == RssFeedStatus.PROCESSING_IN_PROGRESS,
                    feed_status.date_created > processing_cutoff
                ]):
                    continue

                # The court is ripe! Crawl it if it has changed.
                # Make a new object to track the attempted crawl.
                new_status = RssFeedStatus.objects.create(
                    court_id=court.pk,
                    status=RssFeedStatus.PROCESSING_IN_PROGRESS,
                    is_sweep=options['sweep'],
                )

                # Check if the item needs crawling, and crawl it if so.
                chain(
                    check_if_feed_changed.s(court.pk, new_status.pk,
                                            feed_status.date_last_build),
                    merge_rss_feed_contents.s(court.pk, new_status.pk),
                    # Update recap *documents*, not *dockets*. Updating dockets
                    # requires much more work, and we don't expect to get much
                    # docket information from the RSS feeds. RSS feeds also
                    # have information about hundreds or thousands of
                    # dockets. Updating them all would be very bad.
                    add_or_update_recap_document.s(),
                    mark_status_successful.si(new_status.pk),
                ).apply_async()

            # Trim if not too recently trimmed.
            trim_cutoff_date = now() - timedelta(
                seconds=self.DELAY_BETWEEN_CACHE_TRIMS)
            if last_trim_date is None or trim_cutoff_date > last_trim_date:
                trim_rss_cache.delay()
                last_trim_date = now()

            # Wait, then attempt the courts again if iterations not exceeded.
            iterations_completed += 1
            time.sleep(self.DELAY_BETWEEN_ITERATIONS)
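The in-progress check in all three examples hinges on the direction of the timestamp comparison: a status created after the processing cutoff is still fresh, so the court is skipped to give the running crawl time to finish, while a status older than the cutoff is treated as stuck and the court is crawled again. A small sketch of that check, assuming an illustrative value for the duration constant:

from datetime import timedelta

from django.utils.timezone import now

RSS_MAX_PROCESSING_DURATION = 10 * 60  # seconds; illustrative value


def within_grace_period(status_date_created):
    """True while an in-progress crawl is younger than the cutoff.

    The caller skips the court in that case; once the status is older
    than the cutoff, the crawl is assumed stuck and attempted again.
    """
    processing_cutoff = now() - timedelta(
        seconds=RSS_MAX_PROCESSING_DURATION)
    return status_date_created > processing_cutoff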