def get_pdfs(options):
    """Dispatch the PDF download pipeline for every pending free document.

    Every PACERFreeDocumentRow whose error_msg is empty represents a PDF
    reported by the Free Document Report queries that still has to be merged
    into the normal tables (Docket, DocketEntry and RECAPDocument) and then
    downloaded and extracted. For each such row a celery chain is sent that
    runs process_free_opinion_result, get_and_process_pdf and
    delete_pacer_row in that order, followed by add_items_to_solr when
    indexing was requested.

    :param options: Mapping that provides "queue" (the celery queue every
    task is routed to) and "index" (truthy to append the Solr task).
    :return: None
    """
    queue_name = options["queue"]
    do_index = options["index"]
    tweaker = CaseNameTweaker()

    pending = PACERFreeDocumentRow.objects.filter(error_msg="").only("pk")
    total = pending.count()

    task_name = "downloading and indexing" if do_index else "downloading"
    logger.info("%s %s items from PACER." % (task_name, total))

    throttle = CeleryThrottle(queue_name=queue_name)
    for already_sent, row in enumerate(queryset_generator(pending)):
        throttle.maybe_wait()

        # A fresh PACER login is made for the very first row and again after
        # every 30000 rows; presumably this keeps the cookies handed to the
        # workers from going stale during a long run.
        if already_sent % 30000 == 0:
            pacer_session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            pacer_session.login()

        merge_step = process_free_opinion_result.si(row.pk, tweaker)
        fetch_step = get_and_process_pdf.s(pacer_session.cookies, row.pk)
        cleanup_step = delete_pacer_row.s(row.pk)
        pipeline = chain(
            merge_step.set(queue=queue_name),
            fetch_step.set(queue=queue_name),
            cleanup_step.set(queue=queue_name),
        )
        if do_index:
            pipeline |= add_items_to_solr.s("search.RECAPDocument").set(
                queue=queue_name
            )
        pipeline.apply_async()

        sent = already_sent + 1
        if sent % 1000 == 0:
            logger.info(
                "Sent %s/%s tasks to celery for %s so far."
                % (sent, total, task_name)
            )
def get_pdfs(options: OptionsType) -> None:
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each of
    which represents a PDF we need to download and merge into our normal
    tables: Docket, DocketEntry, and RECAPDocument.

    In this function, we iterate over every row whose error_msg is empty and
    send one celery chain per row that runs process_free_opinion_result,
    get_and_process_free_pdf and delete_pacer_row in that order, followed by
    add_items_to_solr when indexing was requested.

    :param options: Mapping that provides "queue" (the celery queue every
    task is routed to) and "index" (truthy to append the Solr task).
    :return: None
    """
    q = cast(str, options["queue"])
    index = options["index"]
    cnt = CaseNameTweaker()

    # Both pk and court_id are read for every row below. Loading only "pk"
    # would leave court_id deferred, and each access to a deferred field
    # costs one extra query per row, so fetch both columns up front.
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only(
        "pk", "court_id"
    )
    count = rows.count()

    task_name = "downloading"
    if index:
        task_name += " and indexing"
    logger.info(f"{task_name} {count} items from PACER.")

    throttle = CeleryThrottle(queue_name=q)
    completed = 0
    for row in rows.iterator():
        # Called before each dispatch so the throttle can pace how fast
        # tasks are handed to the queue.
        throttle.maybe_wait()
        c = chain(
            process_free_opinion_result.si(
                row.pk,
                row.court_id,
                cnt,
            ).set(queue=q),
            get_and_process_free_pdf.s(row.pk, row.court_id).set(queue=q),
            delete_pacer_row.s(row.pk).set(queue=q),
        )
        if index:
            c |= add_items_to_solr.s("search.RECAPDocument").set(queue=q)
        c.apply_async()

        completed += 1
        if completed % 1000 == 0:
            logger.info(
                f"Sent {completed}/{count} tasks to celery for {task_name} so far."
            )
def get_pdfs(options):
    """Send one celery pipeline per outstanding Free Document Report row.

    The PACERFreeDocumentRow table holds one row per PDF that still needs to
    be merged into the normal tables (Docket, DocketEntry, RECAPDocument) and
    then downloaded and extracted. Rows with a non-empty error_msg are left
    alone; every other row gets a chain of process_free_opinion_result,
    get_and_process_pdf and delete_pacer_row, with add_items_to_solr appended
    when indexing is enabled.

    :param options: Mapping read for its 'queue' entry (celery queue used for
    all tasks) and its 'index' entry (truthy to also index in Solr).
    :return: None
    """
    target_queue = options['queue']
    should_index = options['index']
    name_tweaker = CaseNameTweaker()

    def build_pipeline(row_pk, cookies):
        # Assemble the per-row chain; every signature goes to the same queue.
        steps = chain(
            process_free_opinion_result.si(row_pk, name_tweaker).set(
                queue=target_queue),
            get_and_process_pdf.s(cookies, row_pk).set(queue=target_queue),
            delete_pacer_row.s(row_pk).set(queue=target_queue),
        )
        if should_index:
            steps |= add_items_to_solr.s('search.RECAPDocument').set(
                queue=target_queue)
        return steps

    todo = PACERFreeDocumentRow.objects.filter(error_msg="").only('pk')
    todo_count = todo.count()

    task_name = "downloading"
    if should_index:
        task_name = task_name + " and indexing"
    logger.info("%s %s items from PACER." % (task_name, todo_count))

    throttle = CeleryThrottle(queue_name=target_queue)
    for position, row in enumerate(queryset_generator(todo), start=1):
        throttle.maybe_wait()

        # position is 1-based, so this is true for the first row and then
        # once every 30000 rows: that is when a new PACER login is made.
        if (position - 1) % 30000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()

        build_pipeline(row.pk, pacer_session.cookies).apply_async()

        if position % 1000 == 0:
            progress = "Sent %s/%s tasks to celery for %s so far." % (
                position, todo_count, task_name)
            logger.info(progress)