def OCRPDF(task):
    task_tag = "PDF OCR-TO-TEXT"
    print "\n\n************** %s [START] ******************\n" % task_tag
    print "OCRing text from pdf at %s" % task.doc_id
    task.setStatus(412)

    from lib.Worker.Models.cp_pdf import CompassPDF
    from conf import DEBUG
    from vars import ASSET_TAGS

    pdf = CompassPDF(_id=task.doc_id)
    if pdf is None:
        print "PDF IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        return

    """
    In this task, we might be asked to extract from a broken-up sub-group of documents.
    If so, that should be set in the task's properties.
    """
    pdf_reader = pdf.loadFile(pdf.file_name)
    total_pages = pdf_reader.getNumPages()

    if hasattr(task, "split_file"):
        pdf_reader = pdf.loadAsset(task.split_file)

    if pdf_reader is None:
        print "PDF READER IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        return

    lower_bound = 0

    # mirror extractPDFText: when OCRing a sub-group, start at the split offset
    if hasattr(task, "split_index"):
        lower_bound = task.split_index

    upper_bound = lower_bound + pdf_reader.getNumPages()
    texts = [None] * total_pages

    for x in xrange(lower_bound, upper_bound):
        # TODO: OCR the doc (see the ocr_pdf_page sketch below for one possible approach)
        texts[x] = "TBD"

    asset_path = pdf.addAsset(texts, "doc_ocr.json", as_literal=False,
        description="jsonified texts in document; page-by-page. From OCR",
        tags=[ASSET_TAGS['TXT_OCR']])

    if asset_path is not None:
        pdf.addFile(asset_path, None, sync=True)

    pdf.addCompletedTask(task.task_path)
    task.finish()
    print "\n\n************** %s [END] ******************\n" % task_tag
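# The OCR step in OCRPDF is still a TODO. Below is a minimal, hypothetical sketch of one
# way to fill it in: rasterize a single page with wand (which the second extractPDFText
# in this module already imports) and hand the bitmap to tesseract. pytesseract and
# Pillow are assumptions, not imports this module actually makes, and the sketch also
# assumes ImageMagick/Ghostscript can rasterize the PDF; treat it as an illustration
# rather than the project's actual OCR pipeline.
def ocr_pdf_page(pdf_path, page_number, resolution=300):
    from io import BytesIO

    from wand.image import Image as WandImage
    from PIL import Image as PILImage
    import pytesseract

    # render the requested page of the pdf to a PNG blob via ImageMagick
    with WandImage(filename="%s[%d]" % (pdf_path, page_number), resolution=resolution) as img:
        img.format = 'png'
        png_blob = img.make_blob()

    # run tesseract over the rendered page and return whatever text it finds
    return pytesseract.image_to_string(PILImage.open(BytesIO(png_blob)))

# inside OCRPDF's loop, usage would then look something like:
#     texts[x] = ocr_pdf_page(pdf.file_name, x)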
def extractPDFText(task):
    task_tag = "PDF TEXT EXTRACTION"
    print "\n\n************** %s [START] ******************\n" % task_tag
    print "extracting text from pdf at %s" % task.doc_id
    task.setStatus(412)

    from lib.Worker.Models.cp_pdf import CompassPDF
    from conf import DEBUG
    from vars import ASSET_TAGS

    pdf = CompassPDF(_id=task.doc_id)
    if pdf is None:
        print "PDF IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        return

    """
    In this task, we might be asked to extract from a broken-up sub-group of documents.
    If so, that should be set in the task's properties.
    """
    pdf_reader = pdf.loadFile(pdf.file_name)
    total_pages = pdf_reader.getNumPages()

    if hasattr(task, "split_file"):
        pdf_reader = pdf.loadAsset(task.split_file)

    if pdf_reader is None:
        print "PDF READER IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        return

    from json import loads

    lower_bound = 0

    # reuse any previously-extracted texts for this document; otherwise start fresh
    t = pdf.getAsset("doc_texts.json")
    if t is None:
        texts = [None] * total_pages
    else:
        try:
            texts = loads(t[0])
        except (TypeError, ValueError):
            texts = [None] * total_pages

    if hasattr(task, "split_index"):
        lower_bound = task.split_index

    upper_bound = lower_bound + pdf_reader.getNumPages()

    for x in xrange(lower_bound, upper_bound):
        texts[x] = pdf_reader.getPage(x).extractText()

        if DEBUG:
            print "EXTRACTED TEXT from page %d of %d:\n%s" % (x, upper_bound, texts[x])

    asset_path = pdf.addAsset(texts, "doc_texts.json", as_literal=False,
        description="jsonified texts in document; page-by-page, segment-by-segment. uncleaned. (Not OCR)",
        tags=[ASSET_TAGS['TXT_JSON']])

    if asset_path is not None:
        pdf.addFile(asset_path, None, sync=True)

        from lib.Worker.Models.uv_text import UnveillanceText
        uv_text = UnveillanceText(inflate={
            'media_id' : pdf._id,
            'searchable_text' : texts,
            'file_name' : asset_path
        })

        pdf.text_id = uv_text._id
        pdf.save()

    pdf.addCompletedTask(task.task_path)

    if not hasattr(task, "no_continue"):
        from lib.Worker.Models.uv_task import UnveillanceTask
        next_task = UnveillanceTask(inflate={
            'task_path' : 'Text.preprocess_nlp.preprocessNLP',
            'doc_id' : task.doc_id,
            'queue' : task.queue,
            'text_file' : asset_path
        })
        next_task.run()

    if DEBUG:
        print "searchable texts attached to pdf: %d" % len(pdf.searchable_texts)

    task.finish()
    print "\n\n************** %s [END] ******************\n" % task_tag
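# A minimal sketch (not called anywhere) of how extractPDFText might itself be enqueued,
# mirroring the inflate pattern the function above uses to spawn its preprocessNLP
# follow-up. The task_path string is an assumed 'Package.module.function' path to this
# function, and doc_id/queue are placeholders; the real values come from whatever
# dispatches tasks in this project.
def _example_enqueue_pdf_text_extraction(doc_id, queue):
    from lib.Worker.Models.uv_task import UnveillanceTask

    extract_task = UnveillanceTask(inflate={
        'task_path' : 'PDF.extract_pdf_text.extractPDFText',  # assumed path to this task
        'doc_id' : doc_id,
        'queue' : queue
    })

    # adding 'no_continue' : True to the inflate dict would skip the NLP follow-up task
    extract_task.run()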
def extractPDFText(uv_task):
    task_tag = "PDF TEXT EXTRACTION"
    print "\n\n************** %s [START] ******************\n" % task_tag
    print "extracting text from pdf at %s" % uv_task.doc_id
    uv_task.setStatus(302)

    from lib.Worker.Models.cp_pdf import CompassPDF
    pdf = CompassPDF(_id=uv_task.doc_id)

    if pdf is None:
        print "PDF IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        uv_task.fail()
        return

    """
    In this task, we might be asked to extract from a broken-up sub-group of documents.
    If so, that should be set in the task's properties.
    """
    import os
    from fabric.api import settings, local
    from wand.image import Image
    from time import sleep

    from lib.Core.Utils.funcs import cleanLine, generateMD5Hash
    from Models.uv_els_stub import UnveillanceELSStub
    from conf import ANNEX_DIR, DEBUG
    from vars import ASSET_TAGS

    texts = [None] * pdf.total_pages

    # a split document is extracted from its parts; otherwise, from the original file
    if pdf.hasParts():
        extractors = pdf.getParts()
    else:
        extractors = [pdf.file_name]

    count = 0
    for e in extractors:
        if e == pdf.file_name:
            pdf_reader = pdf.loadFile(e)
        else:
            pdf_reader = pdf.loadAsset(e)

        try:
            num_pages = pdf_reader.getNumPages()
        except AttributeError as err:
            print err
            continue

        for x in xrange(0, num_pages):
            text = cleanLine(pdf_reader.getPage(x).extractText())
            texts[count] = text

            # create an elasticsearch stub for this page's text, keyed by its position
            els_stub = UnveillanceELSStub('cp_page_text', inflate={
                'media_id' : pdf._id,
                'searchable_text' : text,
                'index_in_parent' : count,
                '_id' : generateMD5Hash(content=pdf._id, salt=str(count))
            })

            count += 1

    asset_path = pdf.addAsset(texts, "doc_texts.json", as_literal=False,
        description="jsonified texts in document; page-by-page, segment-by-segment. unclean.",
        tags=[ASSET_TAGS['TXT_JSON']])

    if asset_path is not None:
        pdf.addFile(asset_path, None, sync=True)
        pdf.save()

    del texts

    pdf.addCompletedTask(uv_task.task_path)
    uv_task.routeNext(inflate={ 'text_file' : asset_path })

    print "\n\n************** %s [END] ******************\n" % task_tag
    uv_task.finish()
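# A minimal sketch (not called anywhere) of how a downstream task might read the
# doc_texts.json asset that routeNext hands along as 'text_file'. It assumes the asset
# is a plain JSON file reachable under ANNEX_DIR; the exact on-disk layout is not shown
# in this module, so the path handling here is hypothetical.
def _example_load_extracted_texts(text_file):
    import os
    from json import loads

    from conf import ANNEX_DIR

    path = text_file
    if not os.path.isabs(path):
        path = os.path.join(ANNEX_DIR, text_file)  # assumed: asset paths are relative to the annex

    try:
        with open(path, 'rb') as f:
            texts = loads(f.read())
    except (IOError, ValueError):
        return None

    # entries are per-page strings; pages that failed to extract remain None
    return texts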