# Example #1
def get_or_cache_pacer_cookies(user_pk, username, password):
    """Get PACER cookies for a user or create and cache fresh ones

    For the PACER Fetch API, we store users' PACER cookies in Redis with a
    short expiration timeout. This way, we never store their password, and
    we only store their cookies temporarily.

    This function attempts to get cookies for a user from Redis. If it finds
    them, it returns them. If not, it attempts to log the user in and then
    returns the fresh cookies (after caching them).

    :param user_pk: The PK of the user attempting to store their credentials.
    Needed to create the key in Redis.
    :param username: The PACER username of the user
    :param password: The PACER password of the user
    :return: Cookies for the PACER user
    """
    r = make_redis_interface("CACHE")
    cookies = get_pacer_cookie_from_cache(user_pk, r=r)
    if cookies:
        return cookies

    # Unable to find cookies in cache. Login and cache new values.
    cookies = log_into_pacer(username, password)
    # Redis TTL in seconds: the cached cookies expire after one hour.
    cookie_expiration = 60 * 60
    r.set(session_key % user_pk, pickle.dumps(cookies), ex=cookie_expiration)
    return cookies
# Example #2
def get_count_for_endpoint(endpoint, start, end):
    """Get the count of hits for an endpoint by name, during a date range

    One ZSCORE per day in the range is queued on a single pipeline, and the
    scores that come back are summed. Days with no score for the endpoint
    are skipped.

    :param endpoint: The endpoint to get the count for. Typically something
    like 'docket-list' or 'docket-detail'
    :param start: The beginning date (inclusive) you want the results for. A
    string to be interpreted by dateparser
    :param end: The end date (inclusive) you want the results for. A string to
    be interpreted by dateparser.
    :return int: The count for that endpoint
    """
    # Imported here so this function does not rely on a module-level import
    # of the frequency constant.
    from dateutil.rrule import DAILY

    r = make_redis_interface("STATS")
    pipe = r.pipeline()

    # The counters are stored under one key per day, so step through the
    # range one day at a time. rrule requires an explicit frequency.
    dates = [
        d.date().isoformat()
        for d in rrule(
            freq=DAILY,
            dtstart=parser.parse(start, fuzzy=False),
            until=parser.parse(end, fuzzy=False),
        )
    ]
    for d in dates:
        pipe.zscore("api:v3.endpoint.d:%s.counts" % d, endpoint)
    results = pipe.execute()
    # Falsy results (no score for that day) contribute nothing to the sum.
    return sum(score for score in results if score)
# Example #3
    def _log_request(self, request):
        """Record usage statistics for a single API request in Redis.

        All of the counter updates are queued on one pipeline against the
        "STATS" interface and sent together.

        :param request: The request being logged. Its `user` and `path_info`
        attributes are read.
        :return: The list of results produced by executing the pipeline, in
        the order the commands were queued.
        """
        today = date.today().isoformat()
        requester = request.user
        url_name = resolve(request.path_info).url_name
        elapsed_ms = self._get_response_ms()

        pipeline = make_redis_interface("STATS").pipeline()

        # Tallies across all URLs: the request count for today, then the
        # cumulative response time overall and for today.
        pipeline.incr("api:v3.d:%s.count" % today)
        pipeline.incr("api:v3.timing", elapsed_ms)
        pipeline.incr("api:v3.d:%s.timing" % today, elapsed_ms)

        # Per-user sorted sets, all-time and for today. The score is the
        # number of requests made. Users without a pk share one member.
        member = requester.pk or "AnonymousUser"
        user_keys = (
            "api:v3.user.counts",
            "api:v3.user.d:%s.counts" % today,
        )
        for user_key in user_keys:
            pipeline.zincrby(user_key, 1, member)

        # Per-endpoint sorted sets, all-time and for today. The score is the
        # number of requests the endpoint received.
        endpoint_keys = (
            "api:v3.endpoint.counts",
            "api:v3.endpoint.d:%s.counts" % today,
        )
        for endpoint_key in endpoint_keys:
            pipeline.zincrby(endpoint_key, 1, url_name)

        # Per-day timings: each endpoint's score is its total response time
        # for the day. Dividing it by the endpoint's request count for the
        # same day gives the average.
        pipeline.zincrby(
            "api:v3.endpoint.d:%s.timings" % today, elapsed_ms, url_name
        )

        return pipeline.execute()
# Example #4
def get_homepage_stats():
    """Get any stats that are displayed on the homepage and return them as a
    single ``homepage_data`` object.

    NOTE(review): this copy of the function is incomplete. The end of the
    original docstring and several statements in the body are missing, so
    the comments below only describe the lines that are visible.
    """
    r = make_redis_interface("STATS")
    # Timestamp for ten days before now, passed through make_aware with utc.
    # NOTE(review): nothing visible in this copy uses this value.
    ten_days_ago = make_aware(datetime.today() - timedelta(days=10), utc)
    # Redis key names of the form "api:v3.d:<ISO date>.count" for today and
    # each of the nine days before it.
    last_ten_days = [
        "api:v3.d:%s.count" % (date.today() - timedelta(days=x)).isoformat()
        for x in range(0, 10)
    # NOTE(review): the closing "]" of the list above is missing here.
    homepage_data = {
            # NOTE(review): the key and the call that wraps this generator
            # are missing. The generator converts every non-None value
            # returned by MGET for the ten keys to an int.
            int(result) for result in r.mget(*last_ten_days)
            if result is not None
        # NOTE(review): the start of the expression that .annotate() is
        # chained onto is missing.
        ).annotate(Count("clusters"), ).filter(
            # Ensures that we only show good stuff on homepage
            clusters__count__gt=10, ).order_by(
        # NOTE(review): the order_by() arguments and the call that takes the
        # False below are missing.
        False,  # VERY IMPORTANT!
    # NOTE(review): the closing "}" of homepage_data is missing here.
    return homepage_data
# Example #6
def clear_queue(queue_name: str):
    """Empty out a queue, nuking the tasks in it.

    A queue name is derived for every step in DEFAULT_PRIORITY_STEPS and
    each of those keys is deleted from the "CELERY" Redis interface.

    :param queue_name: The base name of the queue to clear.
    :return: The sum of the values returned by the individual delete calls.
    """
    priority_names = [
        make_queue_name_for_pri(queue_name, pri)
        for pri in DEFAULT_PRIORITY_STEPS
    ]
    r = make_redis_interface("CELERY")
    return sum(r.delete(name) for name in priority_names)
# Example #7
def make_lasc_search():
    """Create a logged-in LASCSearch object with cookies pulled from cache

    :return: LASCSearch object
    """
    r = make_redis_interface('CACHE')
    session = LASCSession()
    # The cookies are stored pickled under LASC_SESSION_COOKIE_KEY.
    # NOTE(review): if the key is absent, r.get() presumably returns None and
    # pickle.loads() will raise; confirm that callers populate the key first.
    session.cookies = pickle.loads(r.get(LASC_SESSION_COOKIE_KEY))
    return LASCSearch(session)
def add_all_cases_to_cl(
    options: Dict[str, Union[List[str], int, str, float]], ) -> None:
    """Iterate over courts and gather iquery results from them.

    The visible code reads the "queue", "courts" and "iterations" keys of
    `options`. An "iterations" value of 0 removes the iteration limit from
    the outer loop.

    NOTE(review): several statements are missing from this copy of the
    function (see the inline notes), so it is not runnable as shown.

    :param options: The options from the handle method
    :return None
    """
    q = options["queue"]
    r = make_redis_interface("CACHE")
    # This is a simple dictionary that's populated with the maximum
    # pacer_case_id in the CL DB as of 2021-01-18. The idea is to use this to
    # prevent the scraper from going forever. You can reset it by querying the
    # latest item in the DB by date_filed, and then using r.hmset to save it.
    max_ids = r.hgetall("iquery_max_ids")

    # Start from the district PACER courts, minus three excluded court IDs,
    # and narrow to the requested courts unless "all" was given.
    courts = Court.federal_courts.district_pacer_courts().exclude(
        pk__in=["uscfc", "arb", "cit"])
    if options["courts"] != ["all"]:
        courts = courts.filter(pk__in=options["courts"])
    court_ids = list(courts.values_list("pk", flat=True))

    # Create a queue that's a bit longer than the number of courts we're doing
    throttle = CeleryThrottle(queue_name=q, min_items=len(court_ids) * 2)

    iterations_completed = 0
    # Endless round-robin over the configured database aliases.
    db_key_cycle = itertools.cycle(settings.DATABASES.keys())
    while (options["iterations"] == 0
           or iterations_completed < options["iterations"]):
        if len(court_ids) == 0:
            # No more courts. Done!
            # NOTE(review): the statement that leaves the loop is missing
            # from this copy.

        for court_id in court_ids:
                # NOTE(review): the "try:" that pairs with the "except"
                # below, and whatever preceded it, are missing.
                # Atomically advance this court's counter in the
                # "iquery_status" hash; the new value is the next case ID.
                pacer_case_id = r.hincrby("iquery_status", court_id, 1)
                if pacer_case_id > int(max_ids[court_id]):
                    # Enough scraping. Stop doing this court.
                    # NOTE(review): the statements that stop processing this
                    # court are missing.
                    # Adjust the throttle queue to be shorter.
                    # NOTE(review): len(court_ids * 2) builds a doubled list
                    # just to measure it; it equals len(court_ids) * 2.
                    throttle.set_min(len(court_ids * 2))
                    # NOTE(review): the line below is a fragment of a call
                    # whose opening (and closing) lines are missing.
                    args=(court_id, pacer_case_id, next(db_key_cycle)),
            except Exception as e:
                # Cleanup
                # Undo the increment made above before re-raising.
                r.hincrby("iquery_status", court_id, -1)
                raise e

        iterations_completed += 1
        remaining_iterations = options["iterations"] - iterations_completed
        # NOTE(review): the body of this "if" is missing from this copy.
        if remaining_iterations > 0:
def get_pacer_cookie_from_cache(user_pk, r=None):
    """Get the cookie for a user from the cache.

    :param user_pk: The PK of the user; interpolated into `session_key` to
    build the Redis key.
    :param r: A redis interface. If not provided, a fresh one is used. This is
    a performance enhancement.
    :return Either None if no cache cookies or the cookies if they're found.
    """
    if not r:
        r = make_redis_interface("CACHE")
    pickled_cookie = r.get(session_key % user_pk)
    if pickled_cookie:
        # Cookies are stored pickled; unpickle them before returning.
        return pickle.loads(pickled_cookie)
    return None
def get_queue_length(queue_name="celery"):
    """Get the number of tasks in a celery queue.

    The length is summed over the Redis lists derived from the queue name
    for every step in DEFAULT_PRIORITY_STEPS.

    :param queue_name: The name of the queue you want to inspect.
    :return: the number of items in the queue.
    """
    priority_names = [
        make_queue_name_for_pri(queue_name, pri)
        for pri in DEFAULT_PRIORITY_STEPS
    ]
    r = make_redis_interface("CELERY")
    return sum(r.llen(name) for name in priority_names)
# Example #11
def send_docket_alert(d_pk, since):
    """Send an alert for a given docket

    NOTE(review): several statements are missing from this copy of the
    function (see the inline notes), so it is not runnable as shown.

    :param d_pk: The docket PK that was modified
    :param since: If we run alerts, notify users about items *since* this time.
    :return: None
    """
    # NOTE(review): the filter arguments and the rest of this expression are
    # missing from this copy.
    email_addresses = (User.objects.filter(
    if email_addresses:
        # We have an alert for this docket. Proceed.
        docket = Docket.objects.get(pk=d_pk)
        # Docket entries created at or after `since`.
        # NOTE(review): the remaining filter arguments and the closing ")"
        # are missing.
        new_des = DocketEntry.objects.filter(date_created__gte=since,

        if new_des.count() > 0:
            # Notify every user that's subscribed to this alert.
            case_name = trunc(best_case_name(docket), 100, ellipsis="...")
            subject_template = loader.get_template("docket_alert_subject.txt")
            subject = subject_template.render({
                "docket": docket,
                "count": new_des.count(),
                "case_name": case_name,
            }).strip()  # Remove newlines that editors can insist on adding.
            email_context = {"new_des": new_des, "docket": docket}
            txt_template = loader.get_template("docket_alert_email.txt")
            html_template = loader.get_template("docket_alert_email.html")
            messages = []
            for email_address in email_addresses:
                # NOTE(review): the other constructor arguments and the
                # closing ")" are missing from this copy.
                msg = EmailMultiAlternatives(
                    headers={"X-Entity-Ref-ID": "docket.alert:%s" % d_pk},
                html = html_template.render(email_context)
                msg.attach_alternative(html, "text/html")
                # NOTE(review): no visible line adds msg to `messages`.

            # Add a bcc to the first message in the list so that we get a copy.
            messages[0].bcc = ["*****@*****.**"]
            connection = get_connection()
            # NOTE(review): no visible line uses `connection` to send the
            # messages.
            tally_stat("alerts.docket.alerts.sent", inc=len(email_addresses))


    # Work completed, clear the semaphore
    # NOTE(review): the call that clears the semaphore is missing; only the
    # Redis interface is created here.
    r = make_redis_interface("ALERTS")
def add_or_update_case_db(self, case_id):
    """Add a case from the LASC MAP using an authenticated session object

    NOTE(review): several statements are missing from this copy of the
    function (see the inline notes), so it is not runnable as shown.

    :param self: The celery object
    :param case_id: The case ID to download, for example, '19STCV25157;SS;CV'
    :return: None
    """
    lasc = make_lasc_search()

    clean_data = {}
        # NOTE(review): the "try:" that pairs with the "except" below is
        # missing from this copy.
        clean_data = lasc.get_json_from_internal_case_id(case_id)
        logger.info("Successful Query")
    except RequestException as e:
        # Number of retries this task still has available.
        retries_remaining = self.max_retries - self.request.retries
        if retries_remaining == 0:
            logger.error("RequestException, unable to get case at %s", case_id)
            # NOTE(review): the two strings below are the message of a
            # logging call whose opening line and arguments are missing.
            "Failed to get JSON for '%s', with RequestException: %s. "
            "%s retries remaining.",
        # NOTE(review): whatever used this Redis interface (and any retry
        # or re-raise) is missing.
        r = make_redis_interface("CACHE")

    if not clean_data:
        logger.info("No information for case %s. Possibly sealed?", case_id)
        # NOTE(review): any early exit that followed this log line is
        # missing.

    # Branch on how many dockets already exist for this case ID.
    ds = Docket.objects.filter(case_id=case_id)
    ds_count = ds.count()
    if ds_count == 0:
        logger.info("Adding lasc case with ID: %s", case_id)
        add_case(case_id, clean_data, lasc.case_data)
    elif ds_count == 1:
        # Compare the latest stored hash for this case with the hash of the
        # freshly downloaded data.
        if latest_sha(case_id=case_id) != sha1_of_json_data(lasc.case_data):
            logger.info("Updating lasc case with ID: %s", case_id)
            update_case(lasc, clean_data)
            # NOTE(review): an "else:" appears to be missing before the
            # "already up to date" log line below.
            logger.info("LASC case is already up to date: %s", case_id)
            # NOTE(review): the branch for more than one docket and the
            # opening of its logging call are missing; only the message
            # strings remain.
            "Issue adding or updating lasc case with ID '%s' - Too "
            "many cases in system with that ID (%s cases)",
def get_pacer_cookie_from_cache(user_pk: Union[str, int], r: Redis = None):
    """Get the cookie for a user from the cache.

    :param user_pk: The ID of the user, can be a string or an ID
    :param r: A redis interface. If not provided, a fresh one is used. This is
    a performance enhancement.
    :return Either None if no cache cookies or the cookies if they're found.
    """
    if not r:
        # The cached value is pickled binary data, so ask for an interface
        # that does not decode responses.
        r = make_redis_interface("CACHE", decode_responses=False)
    pickled_cookie = r.get(session_key % user_pk)
    if pickled_cookie:
        return pickle.loads(pickled_cookie)
    return None
def get_avg_ms_for_endpoint(endpoint, d):
    """Get the average response time, in ms, of an endpoint on a given day

    The average is the endpoint's score in that day's "timings" sorted set
    divided by its score in that day's "counts" sorted set.

    :param endpoint: The endpoint to get the average timing for. Typically
    something like 'docket-list' or 'docket-detail'
    :param d: The date to get the timing for (a date object)
    :return: The average number of ms that endpoint used to serve requests on
    that day.
    """
    d_str = d.isoformat()
    r = make_redis_interface("STATS")
    pipe = r.pipeline()
    pipe.zscore("api:v3.endpoint.d:%s.timings" % d_str, endpoint)
    pipe.zscore("api:v3.endpoint.d:%s.counts" % d_str, endpoint)
    total_ms, request_count = pipe.execute()

    # NOTE(review): ZSCORE presumably yields None when the endpoint has no
    # entry for that day, in which case this division raises TypeError;
    # confirm how callers handle days without data.
    return total_ms / request_count
# Example #22
def get_task_wait(
    task: Task,
    rate: str = "1/s",
    key: str = None,
) -> float:
    """Keep a global throttle for tasks

    Can be used via the `throttle_task` decorator above.

    This implements a timestamp-based algorithm.

    Basically, you keep track of the number of requests and use the key
    expiration as a reset of the counter.

    So you have a rate of 5/m, and your first task comes in. You create a key:

        celery_throttle:task_name = 1
        celery_throttle:task_name.expires = 60

    Another task comes in a few seconds later:

        celery_throttle:task_name = 2
        Do not update the ttl, it now has 58s remaining

    And so forth, until:

        celery_throttle:task_name = 6
        (10s remaining)

    We're over the threshold. Re-queue the task for later. 10s later:

        Key expires b/c no more ttl.

    Another task comes in:

        celery_throttle:task_name = 1
        celery_throttle:task_name.expires = 60

    And so forth.


    There is also a scheduler that figures out when to re-queue tasks. The idea
    of the scheduler is simple: If you know the rate the tasks can be
    processed, and if you're getting tasks faster than that rate, you can
    schedule each one to take its turn at a reasonable specified time. This is
    implemented by keeping a timestamp in redis indicating when the throttle
    will no longer be clogged up.

    Say you have a rate of 1/5s, and you get tasks as follows:

         Elapsed Time | Task Number
              1s      |     1
              2s      |     2
              3s      |     3

    Task number 1 runs immediately, but sets a throttle for five seconds until
    more work can be done. The second comes in and sees that the throttle has a
    ttl of three remaining seconds, so it waits that long. Next, task number 3
    comes in. It sees that the current window is full, and that the next one is
    too — only one task every five seconds, right? It has to wait seven
    seconds: two seconds (for the current window) *plus* 5 seconds (for the
    next one, which is occupied by task two).

    And so forth.

    :param task: The task that is being checked
    :param rate: How many times the task can be run during the time period.
    Something like, 1/s, 2/h or similar.
    :param key: If given, add this to the key placed in Redis for the item.
    Typically, this will correspond to the value of an argument passed to the
    throttled task.
    :return: If throttled returns a float of how many seconds the task should
    wait until the next open window for processing. If not throttled, returns
    zero (i.e., don't wait).
    """
    # The optional key is appended after a colon so that each key value gets
    # its own throttle.
    task_sub_key = f"{task.name}{':' + str(key) if key else ''}"
    throttle_key = f"celery_throttle:{task_sub_key}"

    r = make_redis_interface("CACHE")

    allowed_task_count, duration = parse_rate(rate)

    # Check the count in redis
    actual_task_count = r.get(throttle_key)
    if actual_task_count is None:
        # No key. Set the value to 1 and set the ttl of the key.
        r.set(throttle_key, 1, ex=duration)
        return 0

    # Key found. Check if we should throttle.
    if int(actual_task_count) < allowed_task_count:
        # We're OK to run the task. Increment our counter, and say things are
        # OK by returning 0.
        new_count = r.incr(throttle_key, 1)
        if new_count == 1:
            # Safety check. If the count is 1 after incrementing, that means we
            # created the key via the incr command. This can happen when it
            # expires between when we `get` its value up above and when we
            # increment it here. If that happens, it lacks a ttl! Set one.
            # N.B. There's no need to worry about a race condition between our
            # incr above, and the `expire` line here b/c without a ttl on this
            # key, it can't expire between these two commands.
            r.expire(throttle_key, duration)
        return 0

    # Over the threshold. Find the next window and schedule the task.
    schedule_key = f"celery_throttle:schedule:{task_sub_key}"
    n = now()
    delay = r.get(schedule_key)
    if delay is None:
        # No schedule yet. Run the task when the current throttle expires.
        return set_for_next_window(r, throttle_key, schedule_key, n)

    # We have a delay, so use it if it's in the future
    delay = parser.parse(delay)
    if delay < n:
        # Delay is in the past. Run the task when the current throttle expires.
        return set_for_next_window(r, throttle_key, schedule_key, n)

    # Delay is in the future; use it and supplement it
    # Each scheduled task pushes the stored timestamp out by one task's share
    # of the window (duration / allowed_task_count).
    new_time = delay + timedelta(seconds=duration / allowed_task_count)
    r.set(schedule_key, str(new_time))
    return (new_time - n).total_seconds()
def invert_user_logs(start, end, add_usernames=True):
    """Invert the user logs for a period of time

    The user logs have the date in the key and the user as part of the set:

        'api:v3.user.d:2016-10-01.counts': {
           mlissner: 22,
           joe_hazard: 33,
        }

    This inverts these entries to:

        users: {
            mlissner: {
                2016-10-01: 22,
                total: 22,
            },
            joe_hazard: {
                2016-10-01: 33,
                total: 33,
            },
        }

    :param start: The beginning date (inclusive) you want the results for.
    Either a datetime or a string that can be parsed as a date.
    :type start: datetime.datetime
    :param end: The end date (inclusive) you want the results for. Either a
    datetime or a string that can be parsed as a date.
    :type end: datetime.datetime
    :param add_usernames: Stats are stored with the user ID. If this is True,
    add an alias in the returned dictionary that contains the username as well.
    :type add_usernames: bool
    :return The inverted dictionary
    :rtype: dict
    """
    # Imported here so this function does not rely on module-level imports
    # of these names.
    from dateutil import parser as date_parser
    from dateutil.rrule import DAILY

    def _as_datetime(value):
        # NOTE(review): the docstring is ambiguous about whether callers pass
        # datetimes or date strings, so both are accepted.
        if isinstance(value, str):
            return date_parser.parse(value, fuzzy=False)
        return value

    r = make_redis_interface("STATS")
    pipe = r.pipeline()

    # One key per day, so walk the range a day at a time.
    dates = [
        d.date().isoformat()
        for d in rrule(
            freq=DAILY,
            dtstart=_as_datetime(start),
            until=_as_datetime(end),
        )
    ]
    for d in dates:
        pipe.zrange("api:v3.user.d:%s.counts" % d, 0, -1, withscores=True)
    results = pipe.execute()

    # results is a list of results for each of the zrange queries above. Zip
    # those results with the date that created it, and invert the whole thing.
    out = defaultdict(dict)
    for d, result in zip(dates, results):
        for user_id, count in result:
            if user_id == "None" or user_id == "AnonymousUser":
                user_id = "AnonymousUser"
            else:
                user_id = int(user_id)
            count = int(count)
            if out.get(user_id):
                out[user_id][d] = count
                out[user_id]["total"] += count
            else:
                out[user_id] = {d: count, "total": count}

    # Sort the values
    for k, v in out.items():
        out[k] = OrderedDict(sorted(v.items(), key=lambda t: t[0]))

    # Add usernames as alternate keys for every value possible.
    if add_usernames:
        # Iterate over a snapshot: adding the username aliases grows `out`,
        # and a dict must not change size while it is being iterated.
        for k, v in list(out.items()):
            try:
                user = User.objects.get(pk=k)
            except (User.DoesNotExist, ValueError):
                # No matching user (e.g. the "AnonymousUser" entry); there
                # is no username to alias.
                continue
            out[user.username] = v

    return out
