Example #1
async def extract_all_data_from_document(
        file: UploadFile = File(...),
        doc_language: str = Form(default=''),
        convert_to_pdf_timeout_sec: int = Form(default=1800),
        pdf_to_images_timeout_sec: int = Form(default=1800),
        full_extract_timeout_sec: int = Form(default=3600),
        char_coords_debug_enable: bool = Form(default=False),
        output_format: OutputFormat = Form(default=OutputFormat.json),
):
    webdav_client = get_webdav_client()
    request_id = str(uuid4())
    _run_sync_pdf_processing(webdav_client, request_id, file, doc_language, convert_to_pdf_timeout_sec,
                             pdf_to_images_timeout_sec, char_coords_debug_enable, output_format)

    # Wait until Celery finishes the extraction; otherwise respond with 504 Gateway Timeout
    if not _wait_for_pdf_extraction_finish(request_id, full_extract_timeout_sec):
        await purge_data_extraction_task(request_id)
        raise HTTPException(status_code=504,
                            detail=f'Data extraction did not finish in {full_extract_timeout_sec} seconds')

    # Pack all extracted data into a .zip file and clean up the temporary data
    req: RequestMetadata = load_request_metadata_or_raise(request_id)
    if req.status != dto.STATUS_DONE:
        raise HTTPException(status_code=500, detail=f'Request is not finished successfully.\n'
                                                    f'Status: {req.status}.\n'
                                                    f'Detail:\n{req.error_message}')
    mem_stream = pack_request_results(req)
    mem_stream.seek(0)
    response = StreamingResponse(mem_stream, media_type='application/x-zip-compressed')
    response.headers['Content-Disposition'] = 'attachment; filename=packed_data.zip'
    webdav_client.clean(f'{req.request_id}/')
    return response
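
A minimal client-side sketch of calling this endpoint (the host and route path are assumptions; the excerpt does not show the router configuration):

import io
import zipfile

import requests

# Hypothetical URL; the actual route depends on how the app mounts this handler.
url = 'http://localhost:8000/api/v1/data_extraction_tasks/extract_all'

with open('contract.docx', 'rb') as f:
    resp = requests.post(url,
                         files={'file': ('contract.docx', f)},
                         data={'doc_language': 'en'},
                         timeout=3700)
resp.raise_for_status()

# The body is the packed_data.zip stream produced by pack_request_results() (Example #6).
with zipfile.ZipFile(io.BytesIO(resp.content)) as z:
    print(z.namelist())  # extracted artifacts plus status.json
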
Example #2
async def extract_plain_text_from_document(
        file: UploadFile = File(...),
        doc_language: str = Form(default=''),
        convert_to_pdf_timeout_sec: int = Form(default=1800),
        pdf_to_images_timeout_sec: int = Form(default=1800),
        full_extract_timeout_sec: int = Form(default=3600),
        char_coords_debug_enable: bool = Form(default=False),
        output_format: OutputFormat = Form(default=OutputFormat.json),
):
    webdav_client = get_webdav_client()
    request_id = str(uuid4())
    _run_sync_pdf_processing(webdav_client, request_id, file, doc_language, convert_to_pdf_timeout_sec,
                             pdf_to_images_timeout_sec, char_coords_debug_enable, output_format)

    # Wait until Celery finishes the extraction; otherwise respond with 504 Gateway Timeout
    if not _wait_for_pdf_extraction_finish(request_id, full_extract_timeout_sec):
        await purge_data_extraction_task(request_id)
        raise HTTPException(status_code=504,
                            detail=f'Data extraction did not finish in {full_extract_timeout_sec} seconds')

    # Return the extracted plain text and clean up the temporary data
    plain_text = _proxy_request(webdav_client, request_id,
                                load_request_metadata_or_raise(request_id).plain_text_file,
                                headers={'Content-Type': 'text/plain; charset=utf-8'})
    webdav_client.clean(f'{request_id}/')
    return plain_text
Example #3
def process_document(task, request_id: str,
                     request_callback_info: Dict[str, Any]) -> bool:
    request_callback_info = RequestCallbackInfo(**request_callback_info)
    with handle_errors(request_id, request_callback_info):
        webdav_client: WebDavClient = get_webdav_client()
        req: RequestMetadata = load_request_metadata(request_id)
        if not req:
            log.warning(
                f'{request_callback_info.original_file_name} | Canceling document processing (#{request_id}):\n'
                f'Request files do not exist. Probably the request was already canceled.\n'
            )
            return False
        log.info(
            f'{request_callback_info.original_file_name} | Starting text/data extraction '
            f'for request #{request_id}\n')
        with webdav_client.get_as_local_fn(
                f'{request_id}/{req.original_document}') as (fn, _remote_path):
            ext = os.path.splitext(fn)[1]
            if ext and ext.lower() == '.pdf':
                process_pdf(fn, req, webdav_client)
            else:
                log.info(f'{req.original_file_name} | Converting to PDF...')
                with convert_to_pdf(fn, timeout_sec=req.convert_to_pdf_timeout_sec) \
                        as local_converted_pdf_fn:
                    req.converted_to_pdf = os.path.splitext(
                        req.original_document)[0] + '.converted.pdf'
                    webdav_client.upload_file(
                        remote_path=f'{request_id}/{req.converted_to_pdf}',
                        local_path=local_converted_pdf_fn)
                    save_request_metadata(req)
                    process_pdf(local_converted_pdf_fn, req, webdav_client)
        return True
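
handle_errors(...) is not part of this excerpt. A plausible minimal sketch, assuming it marks the request as failed and notifies the caller on any exception (dto.STATUS_FAILURE and deliver_callback are assumed names, not confirmed by the source):

import traceback
from contextlib import contextmanager

@contextmanager
def handle_errors(request_id: str, callback_info: RequestCallbackInfo):
    try:
        yield
    except Exception:
        log.exception(f'{callback_info.original_file_name} | '
                      f'Request #{request_id} failed')
        req = load_request_metadata(request_id)
        if req:
            req.status = dto.STATUS_FAILURE  # assumed failure status constant
            req.error_message = traceback.format_exc()
            save_request_metadata(req)
        deliver_callback(callback_info)  # hypothetical helper firing the HTTP/Celery callback
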
Example #4
async def post_data_extraction_task(file: UploadFile = File(...),
                                    call_back_url: str = Form(default=None),
                                    call_back_celery_broker: str = Form(default=None),
                                    call_back_celery_task_name: str = Form(default=None),
                                    call_back_celery_queue: str = Form(default=None),
                                    call_back_additional_info: str = Form(default=None),
                                    call_back_celery_task_id: str = Form(default=None),
                                    call_back_celery_parent_task_id: str = Form(default=None),
                                    call_back_celery_root_task_id: str = Form(default=None),
                                    call_back_celery_version: int = Form(default=4),
                                    doc_language: str = Form(default=''),
                                    ocr_enable: bool = Form(default=True),
                                    table_extraction_enable: bool = Form(default=True),
                                    deskew_enable: bool = Form(default=True),
                                    char_coords_debug_enable: bool = Form(default=False),
                                    request_id: str = Form(default=None),
                                    log_extra_json_key_value: str = Form(default=None),
                                    convert_to_pdf_timeout_sec: int = Form(default=1800),
                                    pdf_to_images_timeout_sec: int = Form(default=1800),
                                    output_format: OutputFormat = Form(default=OutputFormat.json)):
    webdav_client = get_webdav_client()
    request_id = get_valid_fn(request_id) if request_id else str(uuid4())
    log_extra = json.loads(log_extra_json_key_value) if log_extra_json_key_value else None
    req = RequestMetadata(original_file_name=file.filename,
                          original_document=get_valid_fn(file.filename),
                          request_id=request_id,
                          request_date=datetime.now(),
                          doc_language=doc_language,
                          ocr_enable=ocr_enable,
                          table_extraction_enable=table_extraction_enable,
                          deskew_enable=deskew_enable,
                          char_coords_debug_enable=char_coords_debug_enable,
                          output_format=output_format,
                          convert_to_pdf_timeout_sec=convert_to_pdf_timeout_sec,
                          pdf_to_images_timeout_sec=pdf_to_images_timeout_sec,
                          request_callback_info=RequestCallbackInfo(
                              request_id=request_id,
                              original_file_name=file.filename,
                              call_back_url=call_back_url,
                              call_back_celery_broker=call_back_celery_broker,
                              call_back_celery_queue=call_back_celery_queue,
                              call_back_celery_task_name=call_back_celery_task_name,
                              call_back_additional_info=call_back_additional_info,
                              call_back_celery_task_id=call_back_celery_task_id,
                              call_back_celery_parent_task_id=call_back_celery_parent_task_id,
                              call_back_celery_root_task_id=call_back_celery_root_task_id,
                              call_back_celery_version=call_back_celery_version,
                              log_extra=log_extra))
    webdav_client.mkdir(f'/{req.request_id}')

    save_request_metadata(req)
    webdav_client.upload_to(file.file, f'{req.request_id}/{req.original_document}')
    async_task = process_document.apply_async(
        (req.request_id, req.request_callback_info.to_dict()))

    webdav_client.mkdir(f'{req.request_id}/{task_ids}')
    register_task_id(webdav_client, req.request_id, async_task.id)

    return req.request_id
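
End to end, a caller would submit the file and then poll for completion. A sketch with illustrative route paths only (none of the URLs below are confirmed by the excerpt):

import time

import requests

BASE = 'http://localhost:8000/api/v1/data_extraction_tasks'  # assumed prefix

with open('contract.docx', 'rb') as f:
    request_id = requests.post(BASE, files={'file': f},
                               data={'doc_language': 'en'}).json()

# Assuming a status endpoint that serves the request status (cf. status.json in Example #6);
# the literal 'PENDING' stands in for the unknown value of STATUS_PENDING.
while requests.get(f'{BASE}/{request_id}/status').json()['status'] == 'PENDING':
    time.sleep(5)

plain_text = requests.get(f'{BASE}/{request_id}/plain_text').text  # cf. Example #12
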
Example #5
def load_request_metadata(request_id) -> Optional[RequestMetadata]:
    try:
        webdav_client = get_webdav_client()
        buf = BytesIO()
        webdav_client.download_from(buf, f'{request_id}/{metadata_fn}')
        return RequestMetadata.from_json(buf.getvalue())
    except (RemoteParentNotFound, RemoteResourceNotFound):
        return None
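
load_request_metadata_or_raise(...), used by Examples #1, #12, #14 and others, is not included in the excerpt; a minimal sketch consistent with how it is called:

def load_request_metadata_or_raise(request_id: str) -> RequestMetadata:
    req = load_request_metadata(request_id)
    if not req:
        # Mirrors the 404 wording used in Example #21.
        raise HTTPException(HTTP_404_NOT_FOUND, 'No such data extraction request')
    return req
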
Example #6
def pack_request_results(req: RequestMetadata) -> io.BytesIO:
    temp_dir = mkdtemp()
    try:
        files = [f'/{req.request_id}/{f}' for f in [req.plain_text_file, req.text_structure_file,
                                                    req.tables_file, req.pdf_coordinates_file,
                                                    req.pdf_file] if f]
        get_webdav_client().download_files(files, temp_dir)
        mem_stream = io.BytesIO()
        with zipfile.ZipFile(mem_stream, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for name_only in os.listdir(temp_dir):
                fn = os.path.join(temp_dir, name_only)
                if not os.path.isfile(fn):
                    continue
                zip_file.write(fn, arcname=name_only)
            zip_file.writestr(zinfo_or_arcname='status.json', data=req.to_request_status().to_json())
        return mem_stream
    finally:
        rmtree(temp_dir)
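
Usage, as in Example #1: the caller is expected to rewind the stream before sending it out:

req = load_request_metadata_or_raise(request_id)
mem_stream = pack_request_results(req)
mem_stream.seek(0)  # pack_request_results() leaves the cursor at the end
with zipfile.ZipFile(mem_stream) as z:
    print(z.namelist())  # downloaded result files plus status.json
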
Example #7
async def purge_data_extraction_task(request_id: str):
    problems = dict()
    success = list()
    celery_task_ids: List[str] = get_request_task_ids(get_webdav_client(), request_id)
    for task_id in celery_task_ids:
        try:
            celery_app.control.revoke(task_id, terminate=True)
            success.append(task_id)
        except Exception as ex:
            problems[task_id] = HumanReadableTraceBackException \
                .from_exception(ex) \
                .human_readable_format()
    try:
        get_webdav_client().clean(f'{request_id}/')
    except RemoteResourceNotFound:
        problems[''] = f'Request "{request_id}" is not instantiated on WebDAV'

    return TaskCancelResult(request_id=request_id,
                            task_ids=celery_task_ids,
                            successfully_revoked=success,
                            problems=problems).to_dict()
Example #8
def process_pdf_page_task(_task,
                          request_id: str,
                          original_file_name: str,
                          pdf_page_base_fn: str,
                          page_number: int,
                          ocr_language: str,
                          log_extra: Dict[str, str] = None):
    set_log_extra(log_extra)
    webdav_client = get_webdav_client()
    req = load_request_metadata(request_id)
    if not req:
        log.warning(
            f'{original_file_name} | Could not process pdf page {page_number}: {pdf_page_base_fn}.\n'
            f'Request files do not exist at webdav storage.\n'
            f'Probably the request was already canceled.\n'
            f'(#{request_id})')
        return None
    if req.status != STATUS_PENDING:
        log.info(
            f'{original_file_name} | Canceling pdf page processing sub-task for page {page_number}:'
            f' {pdf_page_base_fn} (request #{request_id})\n'
            f'because the request is already in status {req.status}.')
        return None
    log.info(f'{original_file_name} | Processing PDF page {page_number}...')
    try:
        with webdav_client.get_as_local_fn(f'{req.request_id}/{pages_for_processing}/{pdf_page_base_fn}') \
                as (local_pdf_page_fn, _remote_path):
            with process_pdf_page(
                    local_pdf_page_fn,
                    page_num=page_number,
                    ocr_enabled=req.ocr_enable,
                    ocr_language=ocr_language
            ) as page_proc_res:  # type: PDFPageProcessingResults
                if page_proc_res.page_requires_ocr:
                    webdav_client.upload_file(
                        remote_path=f'{req.request_id}'
                        f'/{pages_ocred}'
                        f'/{page_num_to_fn(page_number)}.pdf',
                        local_path=page_proc_res.ocred_page_fn)
    except Exception as e:
        raise Exception(
            f'{original_file_name} | Exception caught while processing '
            f'PDF page {page_number}: {pdf_page_base_fn}') from e

    return page_number
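
page_num_to_fn(...) is referenced above but not shown. Since finish_pdf_processing() (Example #13) recovers the number with int(os.path.splitext(fn)[0]), any purely numeric stem round-trips; one plausible implementation (the padding width is an assumption):

def page_num_to_fn(page_num: int) -> str:
    # Zero-padding keeps WebDAV directory listings in page order;
    # int() parsing on the consumer side ignores the padding.
    return str(page_num).zfill(8)
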
Example #9
def re_schedule_unknown_pending_tasks(log: Logger, app) -> List[Tuple[str, str]]:
    conf = get_settings()
    webdav_client = get_webdav_client()
    broker_url = conf.celery_broker
    if not broker_url.startswith('redis:'):
        raise Exception('Only the Redis broker is supported for task health tracking.')
    restarted_tasks: List[Tuple[str, str]] = list()
    failed_to_restart_tasks: List[Tuple[str, str]] = list()
    start_time = datetime.now()
    unknown_pending_tasks = get_unknown_pending_tasks(app)
    for task_id in unknown_pending_tasks:
        task_name: Optional[str] = 'unknown'
        try:
            task_info: Dict = webdav_client.unpickle(remote_path=f'{tasks_pending}/{task_id}')
            task_name = task_info['headers']['task'] or 'unknown'

            with Connection(broker_url) as conn:
                producer = conn.Producer(serializer='json')
                producer.publish(task_info['body'],
                                 routing_key=task_info['routing_key'],
                                 delivery_mode=2,
                                 serializer='pickle',
                                 headers=task_info['headers'],
                                 exchange=task_info['exchange'],
                                 retry=task_info['retry_policy'] is not None,
                                 retry_policy=task_info['retry_policy'])
                restarted_tasks.append((task_id, task_name))
        except RemoteResourceNotFound:
            log.warning(f'Unable to restart lost pending task '
                        f'because it has been completed already: #{task_id} - {task_name}')
        except Exception as ex:
            failed_to_restart_tasks.append((task_id, task_name))
            log.error(f'Unable to restart lost pending task: #{task_id} - {task_name}', exc_info=ex)
    if unknown_pending_tasks:
        time_spent = datetime.now() - start_time
        msg = f'Found {len(unknown_pending_tasks)} unknown/lost tasks registered at WebDAV ' \
              f'but not found in the Redis queue; restarted {len(restarted_tasks)}.\n' \
              f'Time spent: {time_spent}\n'
        if restarted_tasks:
            msg += 'Restarted tasks:\n' + '\n'.join(' - '.join(item) for item in restarted_tasks)
        if failed_to_restart_tasks:
            msg += '\nFailed to restart tasks:\n' + '\n'.join(' - '.join(item) for item in failed_to_restart_tasks)
        log.info(msg)
    return restarted_tasks
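
get_unknown_pending_tasks(app) is also outside the excerpt. A minimal sketch under the same Redis-only assumption: diff the task ids registered on WebDAV (Example #15) against the ids still queued in Redis or already picked up by workers ('celery' is the default Kombu queue key):

import json
from typing import Set

from redis import Redis

def get_unknown_pending_tasks(app) -> Set[str]:
    registered: Set[str] = get_pending_tasks_from_webdav()

    # Task ids still waiting in the default Redis queue.
    r = Redis.from_url(get_settings().celery_broker)
    known: Set[str] = {json.loads(raw)['headers']['id']
                       for raw in r.lrange('celery', 0, -1)}

    # Also exclude tasks already reserved by or running on workers.
    inspect = app.control.inspect()
    for worker_tasks in list((inspect.active() or {}).values()) + \
            list((inspect.reserved() or {}).values()):
        known.update(t['id'] for t in worker_tasks)

    return registered - known
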
Example #10
def store_pending_task_info_in_webdav(body,
                                      exchange,
                                      routing_key,
                                      headers,
                                      properties,
                                      declare,
                                      retry_policy):
    if routing_key == queue_celery_beat:
        # don't track Celery Beat tasks as they are not going through the default queue in Redis
        return

    # structures are described here:
    # https://docs.celeryproject.org/en/stable/internals/protocol.html#message-protocol-task-v2
    webdav_client = get_webdav_client()
    task_info = dict(exchange=exchange,
                     routing_key=routing_key,
                     headers=headers,
                     body=body,
                     retry_policy=retry_policy,
                     properties=properties)
    task_id = headers['id']
    webdav_client.pickle(obj=task_info, remote_path=f'{tasks_pending}/{task_id}')
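
The parameter list of store_pending_task_info_in_webdav() matches Celery's before_task_publish signal, which is presumably how it gets called; the wiring would look like this:

from celery.signals import before_task_publish

@before_task_publish.connect
def on_before_task_publish(body=None, exchange=None, routing_key=None, headers=None,
                           properties=None, declare=None, retry_policy=None, **_kwargs):
    store_pending_task_info_in_webdav(body, exchange, routing_key, headers,
                                      properties, declare, retry_policy)
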
Example #11
def save_request_metadata(req: RequestMetadata):
    webdav_client = get_webdav_client()
    webdav_client.upload_to(req.to_json(indent=2).encode('utf-8'), f'{req.request_id}/{metadata_fn}')
Example #12
async def get_extracted_plain_text(request_id: str):
    return _proxy_request(get_webdav_client(),
                          request_id,
                          load_request_metadata_or_raise(request_id).plain_text_file,
                          headers={'Content-Type': 'text/plain; charset=utf-8'})
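
_proxy_request(...), shared by this and the other download endpoints, is not included; a minimal sketch that streams a WebDAV file through the API (the exact error handling is an assumption):

def _proxy_request(webdav_client, request_id: str, file_name: str,
                   headers: Dict[str, str] = None):
    if not file_name:
        raise HTTPException(HTTP_404_NOT_FOUND, 'Requested file was not extracted')
    buf = BytesIO()
    try:
        webdav_client.download_from(buf, f'{request_id}/{file_name}')
    except RemoteResourceNotFound:
        raise HTTPException(HTTP_404_NOT_FOUND, 'No such data extraction request')
    buf.seek(0)
    return StreamingResponse(buf, headers=headers)
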
Example #13
def finish_pdf_processing(task, ocred_page_nums: List[int], request_id: str,
                          original_file_name: str,
                          req_callback_info: Dict[str, Any]):
    req_callback_info = RequestCallbackInfo(**req_callback_info)
    with handle_errors(request_id, req_callback_info):
        req: RequestMetadata = load_request_metadata(request_id)
        if not req:
            log.info(
                f'{original_file_name} | Not re-combining pdf blocks and not '
                f'processing the data extraction for request {request_id}.\n'
                f'Request files do not exist. Probably the request was already canceled.'
            )
            return False
        log.info(
            f'{req.original_file_name} | Re-combining pdf blocks ({ocred_page_nums}) and '
            f'processing the data extraction for request #{request_id}')
        webdav_client: WebDavClient = get_webdav_client()
        if req.status != STATUS_PENDING or not webdav_client.is_dir(
                f'{req.request_id}/{pages_for_processing}'):
            log.info(
                f'{req.original_file_name} | Request is already processed/failed/canceled (#{request_id})'
            )
            return False
        temp_dir = tempfile.mkdtemp()
        try:
            pages_dir = os.path.join(temp_dir, 'pages')
            os.mkdir(pages_dir)

            requires_page_merge: bool = False

            # download PDFs of the OCRed pages
            # each page contains a transparent layer (glyphless font) with the recognized text
            # file names of the pages at webdav are generated in process_pdf_page_task(..) as:
            # <page_num>.pdf

            pdf_pages_ocred: List[int] = list()

            for remote_base_fn in webdav_client.list(
                    f'{request_id}/{pages_ocred}'):
                remote_page_pdf_fn = f'{req.request_id}/{pages_ocred}/{remote_base_fn}'
                local_page_pdf_fn = os.path.join(pages_dir, remote_base_fn)
                webdav_client.download_file(remote_page_pdf_fn,
                                            local_page_pdf_fn)
                pdf_pages_ocred.append(int(
                    os.path.splitext(remote_base_fn)[0]))
                requires_page_merge = True

            if requires_page_merge:
                req.pdf_pages_ocred = pdf_pages_ocred
                original_pdf_in_storage = req.converted_to_pdf or req.original_document
                local_orig_pdf_fn = os.path.join(temp_dir,
                                                 original_pdf_in_storage)

                webdav_client.download_file(
                    f'{req.request_id}/{original_pdf_in_storage}',
                    local_orig_pdf_fn)

                # merge-in the OCRed pages into the original PDF by adding them as layers on the original pages
                # merge_pdf_pages() expects the page PDF in pages_dir to be named as
                # either <page_num>.<rotation_angle>.pdf
                # or <page_num>.pdf
                with merge_pdf_pages(local_orig_pdf_fn,
                                     pages_dir) as local_merged_pdf_fn:
                    req.ocred_pdf = os.path.splitext(
                        original_pdf_in_storage)[0] + '.ocred.pdf'
                    webdav_client.upload_file(
                        f'{req.request_id}/{req.ocred_pdf}',
                        local_merged_pdf_fn)
                    extract_data_and_finish(req, webdav_client,
                                            local_merged_pdf_fn)
            else:
                remote_fn = req.converted_to_pdf or req.original_document
                with webdav_client.get_as_local_fn(
                        f'{req.request_id}/{remote_fn}') as (local_pdf_fn,
                                                             _remote_path):
                    extract_data_and_finish(req, webdav_client, local_pdf_fn)

        finally:
            shutil.rmtree(temp_dir)
Example #14
async def get_extracted_text_structure_as_msgpack(request_id: str):
    return _proxy_request(get_webdav_client(),
                          request_id,
                          load_request_metadata_or_raise(request_id).text_structure_file)
Example #15
def get_pending_tasks_from_webdav() -> Set[str]:
    webdav = get_webdav_client()
    return set(webdav.list(remote_path=tasks_pending, get_info=False))
Example #16
def remove_pending_task_info_from_webdav(task_id: str, task_name: str):
    try:
        get_webdav_client().clean(f'{tasks_pending}/{task_id}')
    except RemoteResourceNotFound:
        pass
Example #17
def init_task_tracking(*args, **kwargs):
    get_webdav_client().mkdir(tasks_pending)
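
remove_pending_task_info_from_webdav() and init_task_tracking() read like Celery signal handlers as well; a plausible wiring, assuming task_postrun for cleanup and worker_ready for bootstrap:

from celery.signals import task_postrun, worker_ready

worker_ready.connect(init_task_tracking)

@task_postrun.connect
def on_task_postrun(task_id=None, task=None, **_kwargs):
    remove_pending_task_info_from_webdav(task_id, task.name if task else 'unknown')
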
Example #18
async def get_extracted_tables_as_json(request_id: str):
    return _proxy_request(get_webdav_client(), request_id, load_request_metadata_or_raise(request_id).tables_file)
Example #19
async def get_pdf_coordinates_of_each_character_in_extracted_plain_text_as_msgpack(request_id: str):
    return _proxy_request(get_webdav_client(),
                          request_id,
                          load_request_metadata_or_raise(request_id).pdf_coordinates_file)
Example #20
async def get_searchable_pdf(request_id: str):
    return _proxy_request(get_webdav_client(),
                          request_id,
                          load_request_metadata_or_raise(request_id).pdf_file)
Example #21
async def delete_request_files(request_id: str):
    try:
        get_webdav_client().clean(f'{request_id}/')
    except RemoteResourceNotFound:
        raise HTTPException(HTTP_404_NOT_FOUND, 'No such data extraction request')