def update_documents(self, opinion_pks: Iterable, queue_name: str) -> None:
    """Fan opinion PKs out to Celery as chunked citation-finding tasks.

    PKs are batched into chunks of 100 and each chunk is sent to
    ``find_citations_for_opinion_by_pks`` on the given queue. A
    ``CeleryThrottle`` paces submission so the queue isn't flooded.

    :param opinion_pks: Iterable of opinion primary keys to process.
    :param queue_name: The Celery queue to send the tasks to.
    :return: None
    """
    sys.stdout.write("Graph size is {0:d} nodes.\n".format(self.count))
    sys.stdout.flush()

    # Only index inside the subtask when indexing is configured to run
    # concurrently with citation finding.
    index_during_subtask = self.index == "concurrently"

    chunk = []
    chunk_size = 100
    processed_count = 0
    throttle = CeleryThrottle(queue_name=queue_name)
    for opinion_pk in opinion_pks:
        throttle.maybe_wait()
        processed_count += 1
        last_item = self.count == processed_count
        chunk.append(opinion_pk)
        if processed_count % chunk_size == 0 or last_item:
            find_citations_for_opinion_by_pks.apply_async(
                args=(chunk, index_during_subtask),
                queue=queue_name,
            )
            chunk = []
        self.log_progress(processed_count, opinion_pk)

    # Flush any trailing partial chunk. If self.count is out of sync with
    # the true length of opinion_pks, the last_item check above never
    # fires and the final partial chunk would otherwise be dropped. This
    # is a no-op when the counts match, since chunk is cleared after each
    # dispatch.
    if chunk:
        find_citations_for_opinion_by_pks.apply_async(
            args=(chunk, index_during_subtask),
            queue=queue_name,
        )
def extract_doc_content(pk, do_ocr=False, citation_jitter=False):
    """Extract an opinion's text from its local file and store it.

    Sniff the file extension, run the matching extractor, anonymize and
    persist the content, then asynchronously kick off citation finding so
    the document gets linked to others. This implementation uses local
    paths.

    :param pk: The opinion primary key to work on
    :param do_ocr: Whether the PDF converting function should use OCR
    :param citation_jitter: Whether to apply jitter before running the
    citation parsing code. This can be useful do spread these tasks out when
    doing a larger scrape.
    """
    opinion = Opinion.objects.get(pk=pk)
    path = opinion.local_path.path
    extension = path.split(".")[-1]

    # Dispatch on extension; each entry defers the extraction call until
    # we know the extension is supported.
    extractors = {
        "doc": lambda: extract_from_doc(path),
        "docx": lambda: extract_from_docx(path),
        "html": lambda: extract_from_html(path),
        "pdf": lambda: extract_from_pdf(path, opinion, do_ocr),
        "txt": lambda: extract_from_txt(path),
        "wpd": lambda: extract_from_wpd(path, opinion),
    }
    extractor = extractors.get(extension)
    if extractor is None:
        print("*****Unable to extract content due to unknown extension: %s "
              "on opinion: %s****" % (extension, opinion))
        return
    content, err = extractor()

    assert isinstance(
        content, str), "content must be of type str, not %s" % type(content)

    # Do page count, if possible
    opinion.page_count = get_page_count(path, extension)

    # Do blocked status. html/wpd content keeps its markup; everything
    # else is stored as plain text.
    anonymized, blocked = anonymize(content)
    if extension in ["html", "wpd"]:
        opinion.html = anonymized
    else:
        opinion.plain_text = anonymized
    if blocked:
        opinion.cluster.blocked = True
        opinion.cluster.date_blocked = now()

    update_document_from_text(opinion)

    if err:
        print(err)
        print("****Error extracting text from %s: %s****" %
              (extension, opinion))
        return

    # Save item, and index Solr if needed.
    # noinspection PyBroadException
    try:
        opinion.cluster.docket.save()
        opinion.cluster.save(index=False)
        # Citations are done imminently unless jitter delays them, in
        # which case we index right away instead of waiting.
        opinion.save(index=bool(citation_jitter))
    except Exception:
        print("****Error saving text to the db for: %s****\n%s" %
              (opinion, traceback.format_exc()))
        return

    # Identify and link citations within the document content
    find_citations_for_opinion_by_pks.apply_async(
        ([opinion.pk],), countdown=random.randint(0, 3600))
def extract_doc_content(
    pk: int,
    ocr_available: bool = False,
    citation_jitter: bool = False,
) -> None:
    """Extract an opinion's text and schedule citation finding.

    Pull the opinion's file from storage (S3) into a temporary file,
    sniff its extension, run the matching extractor, persist the results,
    and asynchronously link citations found in the content.

    :param pk: The opinion primary key to work on
    :param ocr_available: Whether the PDF converting function should use OCR
    :param citation_jitter: Whether to apply jitter before running the
    citation parsing code. This can be useful do spread these tasks out when
    doing a larger scrape.
    """
    opinion = Opinion.objects.get(pk=pk)
    extension = opinion.local_path.name.split(".")[-1]

    with NamedTemporaryFile(
        prefix="extract_file_",
        suffix=f".{extension}",
        buffering=0,  # Make sure it's on disk when we try to use it
    ) as tmp:
        # Get file contents from S3 and put them in a temp file.
        tmp.write(opinion.local_path.read())

        # Dispatch on extension; each entry defers the extraction call
        # until we know the extension is supported.
        extractors = {
            "doc": lambda: extract_from_doc(tmp.name),
            "docx": lambda: extract_from_docx(tmp.name),
            "html": lambda: extract_from_html(tmp.name),
            "pdf": lambda: extract_from_pdf(tmp.name, opinion, ocr_available),
            "txt": lambda: extract_from_txt(tmp.name),
            "wpd": lambda: extract_from_wpd(tmp.name, opinion),
        }
        extractor = extractors.get(extension)
        if extractor is None:
            print(
                "*****Unable to extract content due to unknown extension: %s "
                "on opinion: %s****" % (extension, opinion))
            return
        content, err = extractor()

        # Do page count, if possible. The temp file must still exist
        # here, so this stays inside the with block.
        opinion.page_count = get_page_count(tmp.name, extension)

    assert isinstance(
        content, str
    ), f"content must be of type str, not {type(content)}"

    set_blocked_status(opinion, content, extension)
    update_document_from_text(opinion)

    if err:
        print(err)
        print(f"****Error extracting text from {extension}: {opinion}****")
        return

    # Save item, and index Solr if needed.
    # noinspection PyBroadException
    try:
        opinion.cluster.docket.save()
        opinion.cluster.save(index=False)
        # Citations are done imminently unless jitter delays them, in
        # which case we index right away instead of waiting.
        opinion.save(index=bool(citation_jitter))
    except Exception:
        print("****Error saving text to the db for: %s****\n%s" %
              (opinion, traceback.format_exc()))
        return

    # Identify and link citations within the document content
    find_citations_for_opinion_by_pks.apply_async(
        ([opinion.pk],), countdown=random.randint(0, 3600))
def extract_doc_content(pk, do_ocr=False, citation_jitter=False):
    """Extract text from an opinion's local file and store it.

    The file's extension determines which extractor runs. Content is
    anonymized, saved on the opinion, and citation finding is scheduled
    asynchronously. This implementation uses local paths.

    :param pk: The opinion primary key to work on
    :param do_ocr: Whether the PDF converting function should use OCR
    :param citation_jitter: Whether to apply jitter before running the
    citation parsing code. This can be useful do spread these tasks out when
    doing a larger scrape.
    """
    opinion = Opinion.objects.get(pk=pk)
    path = opinion.local_path.path
    extension = path.split('.')[-1]

    # Dispatch on extension; each entry defers the extraction call until
    # we know the extension is supported.
    handlers = {
        'doc': lambda: extract_from_doc(path),
        'docx': lambda: extract_from_docx(path),
        'html': lambda: extract_from_html(path),
        'pdf': lambda: extract_from_pdf(path, opinion, do_ocr),
        'txt': lambda: extract_from_txt(path),
        'wpd': lambda: extract_from_wpd(path, opinion),
    }
    handler = handlers.get(extension)
    if handler is None:
        print('*****Unable to extract content due to unknown extension: %s '
              'on opinion: %s****' % (extension, opinion))
        return
    content, err = handler()

    # Do page count, if possible
    opinion.page_count = get_page_count(path, extension)

    # Do blocked status. html/wpd content keeps its markup; everything
    # else is stored as plain text.
    anonymized, blocked = anonymize(content)
    if extension in ['html', 'wpd']:
        opinion.html = anonymized
    else:
        opinion.plain_text = anonymized
    if blocked:
        opinion.cluster.blocked = True
        opinion.cluster.date_blocked = now()

    if err:
        print("****Error extracting text from %s: %s****" %
              (extension, opinion))
        return

    # Save item, and index Solr if needed.
    # noinspection PyBroadException
    try:
        # Both branches save the cluster the same way, so do it once.
        opinion.cluster.save(index=False)
        # Citations are done imminently unless jitter delays them, in
        # which case we index right away instead of waiting.
        opinion.save(index=bool(citation_jitter))
    except Exception:
        print("****Error saving text to the db for: %s****\n%s" %
              (opinion, traceback.format_exc()))
        return

    # Identify and link citations within the document content
    find_citations_for_opinion_by_pks.apply_async(
        ([opinion.pk],), countdown=random.randint(0, 3600)
    )