def query_and_export(options):
    """Run each query from the client's spreadsheet and export the results.

    The client provided a spreadsheet full of query links. Run every query,
    gather the PKs of the dockets each query matched, then serialize those
    dockets to disk as the deliverable for the client.

    :param options: The argparse options
    :return None
    """
    # Phase one: walk the CSV of query links and accumulate matched PKs.
    query_reader = csv.DictReader(options["file"])
    matched_pks = set()
    for row_num, row in enumerate(query_reader):
        if row_num < options["query_offset"]:
            continue
        # Chained comparison: stop once row_num >= query_limit, but only
        # when a positive limit was requested (<= 0 means "no limit").
        if row_num >= options["query_limit"] > 0:
            break
        query_params = get_query_from_link(row["Link"])
        logger.info("Doing query: %s", query_params)
        matched_pks.update(query_dockets(query_params))

    # Phase two: queue one serialization task per matched docket.
    queue_name = options["queue"]
    throttle = CeleryThrottle(queue_name=queue_name)
    for count, d_pk in enumerate(matched_pks):
        if count < options["offset"]:
            continue
        if count >= options["limit"] > 0:
            break
        if count % 1000 == 0:
            # Progress marker once per thousand items.
            logger.info("Doing item %s with pk %s", count, d_pk)
        throttle.maybe_wait()
        save_ia_docket_to_disk.apply_async(
            args=(d_pk, options["output_directory"]),
            queue=queue_name,
        )
def do_bulk_export(options):
    """Save selected dockets from 2016 to disk.

    This will serialize the items to disk using celery tasks and the IA
    serializer.

    :param options: The argparse options, supplying the celery queue name,
        the PK offset to resume from, a hard limit, and the output
        directory.
    :return None
    """
    q = options["queue"]
    offset = options["offset"]
    throttle = CeleryThrottle(queue_name=q)
    if offset > 0:
        # Bug fix: the message previously read "less than than".
        logger.info("Skipping dockets with PK less than %s", offset)
    # RECAP-sourced federal district dockets filed during calendar year
    # 2016, ordered by PK so the offset makes the job resumable.
    d_pks = (
        Docket.objects.filter(
            court__jurisdiction=Court.FEDERAL_DISTRICT,
            pk__gt=offset,
            source__in=Docket.RECAP_SOURCES,
            date_filed__gte="2016-01-01",
            date_filed__lte="2016-12-31",
        )
        .order_by("pk")
        .values_list("pk", flat=True)
    )
    for i, d_pk in enumerate(d_pks):
        # Chained comparison: stop once i >= limit, but only when a
        # positive limit was requested (<= 0 means "no limit").
        if i >= options["limit"] > 0:
            break
        logger.info("Doing item %s with pk %s", i, d_pk)
        throttle.maybe_wait()
        save_ia_docket_to_disk.apply_async(
            args=(d_pk, options["output_directory"]),
            queue=q,
        )
def do_bulk_export(options):
    """Bulk export bankruptcy dockets to disk.

    The final step of this project is to bulk export an outrageous amount
    of bankruptcy data from our system.

    Limit/offset work differently than in many other functions. Limit is a
    true hard limit to the number that should get done. A limit of 10 means
    ten items will be done. Offset corresponds to the docket PK below which
    you do not want to process. (It does *not* correspond to the number of
    completed items.)
    """
    queue_name = options['queue']
    pk_floor = options['offset']
    hard_limit = options['limit']
    output_dir = options['output_directory']
    throttle = CeleryThrottle(queue_name=queue_name)
    if pk_floor > 0:
        logger.info("Skipping to dockets with PK greater than %s", pk_floor)
    # All federal bankruptcy dockets above the PK floor, ascending by PK
    # so the offset makes the job resumable.
    docket_pks = (
        Docket.objects.filter(
            court__jurisdiction=Court.FEDERAL_BANKRUPTCY,
            pk__gt=pk_floor,
        )
        .order_by('pk')
        .values_list('pk', flat=True)
    )
    for count, docket_pk in enumerate(docket_pks):
        # Chained comparison: stop once count >= hard_limit, but only
        # when a positive limit was requested (<= 0 means "no limit").
        if count >= hard_limit > 0:
            break
        logger.info("Doing item %s with pk %s", count, docket_pk)
        throttle.maybe_wait()
        save_ia_docket_to_disk.apply_async(
            args=(docket_pk, output_dir),
            queue=queue_name,
        )
def do_bulk_export(options):
    """Save selected dockets to disk.

    This will serialize the items to disk using celery tasks and the IA
    serializer.
    """
    queue_name = options["queue"]
    skip_below = options["offset"]
    max_items = options["limit"]
    throttle = CeleryThrottle(queue_name=queue_name)
    for count, docket_pk in enumerate(docket_pks_for_query(QUERY)):
        if count < skip_below:
            continue
        # Chained comparison: stop once count >= max_items, but only
        # when a positive limit was requested (<= 0 means "no limit").
        if count >= max_items > 0:
            break
        if count % 1000 == 0:
            # Progress marker once per thousand items.
            logger.info("Doing item %s with pk %s", count, docket_pk)
        throttle.maybe_wait()
        save_ia_docket_to_disk.apply_async(
            args=(docket_pk, options["output_directory"]),
            queue=queue_name,
        )