def extract_recap_documents(
    docs: QuerySet,
    skip_ocr: bool = False,
    order_by: Optional[str] = None,
    queue: Optional[str] = None,
) -> None:
    """Send one extraction task to celery for each eligible RECAPDocument.

    Documents whose ``filepath_local`` is empty are dropped first. The
    remaining queryset is narrowed by ``ocr_status``, optionally ordered by
    ``page_count``, and then an ``extract_recap_pdf`` task is dispatched for
    every primary key. Progress is logged and echoed to stdout on the first
    task and on every 1000th task after it.

    :param docs: A queryset containing the RECAPDocuments to be processed.
    :type docs: Django Queryset
    :param skip_ocr: Whether OCR should be completed (False) or whether items
    should simply be updated to have status OCR_NEEDED. The flag is also
    forwarded unchanged to each task.
    :type skip_ocr: Bool
    :param order_by: An optimization parameter. 'small-first' orders by
    ascending page count and 'big-first' by descending page count; any other
    value leaves the queryset ordering untouched.
    :type order_by: str
    :param queue: The celery queue to send the content to. Also used as the
    queue name for the throttle.
    :type queue: str
    """
    # When skipping OCR, look at the items whose OCR need is still unknown
    # (status None); when doing OCR, look only at the items flagged as
    # needing it.
    wanted_status = None if skip_ocr else RECAPDocument.OCR_NEEDED
    docs = docs.exclude(filepath_local="").filter(ocr_status=wanted_status)

    # Translate the user-facing ordering keyword into a model field ordering.
    ordering_field = {
        "small-first": "page_count",
        "big-first": "-page_count",
    }.get(order_by)
    if ordering_field is not None:
        docs = docs.order_by(ordering_field)

    total = docs.count()
    throttle = CeleryThrottle(queue_name=queue)
    primary_keys = docs.values_list("pk", flat=True)
    for sent, pk in enumerate(primary_keys, start=1):
        throttle.maybe_wait()
        extract_recap_pdf.apply_async((pk, skip_ocr), priority=5, queue=queue)
        # Report on task 1, 1001, 2001, ... and stay quiet otherwise.
        if sent % 1000 != 1:
            continue
        msg = f"Sent {sent}/{total} tasks to celery so far."
        logger.info(msg)
        # Carriage return keeps the progress on a single terminal line.
        sys.stdout.write(f"\r{msg}")
        sys.stdout.flush()
def extract_recap_documents(
    docs: "QuerySet",
    skip_ocr: bool = False,
    order_by: Optional[str] = None,
    queue: Optional[str] = None,
) -> None:
    """Loop over RECAPDocuments and extract their contents. Use OCR if requested.

    Items with an empty ``filepath_local`` are excluded. For every remaining
    item an ``extract_recap_pdf`` task is sent to celery with priority 5,
    calling the throttle's ``maybe_wait()`` before each dispatch. A progress
    message is logged and written to stdout for the first task and then once
    per 1000 tasks.

    :param docs: A queryset containing the RECAPDocuments to be processed.
    :type docs: Django Queryset
    :param skip_ocr: Whether OCR should be completed (False) or whether items
    should simply be updated to have status OCR_NEEDED.
    :type skip_ocr: Bool
    :param order_by: An optimization parameter. You may opt to order the
    processing by 'small-first' or 'big-first'. Any other value leaves the
    ordering of ``docs`` unchanged.
    :type order_by: str
    :param queue: The celery queue to send the content to.
    :type queue: str
    """
    docs = docs.exclude(filepath_local="")
    if skip_ocr:
        # Focus on the items that we don't know if they need OCR.
        docs = docs.filter(ocr_status=None)
    else:
        # We're doing OCR. Only work with those items that require it.
        docs = docs.filter(ocr_status=RECAPDocument.OCR_NEEDED)

    if order_by is not None:
        if order_by == "small-first":
            docs = docs.order_by("page_count")
        elif order_by == "big-first":
            docs = docs.order_by("-page_count")

    # Counted once up front; used only for the progress message.
    count = docs.count()
    throttle = CeleryThrottle(queue_name=queue)
    for i, pk in enumerate(docs.values_list("pk", flat=True)):
        throttle.maybe_wait()
        extract_recap_pdf.apply_async((pk, skip_ocr), priority=5, queue=queue)
        if i % 1000 == 0:
            msg = f"Sent {i + 1}/{count} tasks to celery so far."
            logger.info(msg)
            # Leading "\r" rewrites the same terminal line on each update.
            sys.stdout.write(f"\r{msg}")
            sys.stdout.flush()