Exemplo n.º 1
0
    def assert_func(rfile, headers):
        """Check the delivered extraction results for the 22-page document.

        Parses a ``RequestStatus`` from ``rfile``, verifies its flags and then
        downloads the plain text, the PDF and the text structure through
        ``client`` to verify their contents. ``headers`` is not used.
        ``fn`` and ``client`` are taken from the enclosing scope.
        """
        log.info('Text extraction results are ready...')
        status: RequestStatus = RequestStatus.from_json(rfile)

        # Status flags reported by the service.
        assert status.status == 'DONE'
        expected_name = os.path.basename(fn)
        assert expected_name == status.original_file_name
        assert status.converted_cleaned_pdf
        assert status.tables_extracted is False
        assert status.plain_text_extracted
        assert status.text_structure_extracted
        assert status.additional_info == 'hello world'

        # Plain text must mention every checked page.
        # NOTE(review): only pages 1..21 are checked here although 22 pages
        # are expected below - confirm this is intentional.
        plain_text = client.get_plain_text(status.request_id)
        for page_no in range(1, 22):
            assert f'This is page {page_no}' in plain_text

        # The resulting PDF must keep all pages.
        with client.get_pdf_as_local_file(status.request_id) as local_pdf, \
                pikepdf.open(local_pdf) as pdf_doc:
            assert len(pdf_doc.pages) == 22

        # Structure: language, page, paragraph and sentence counts.
        structure: PlainTextStructure = \
            client.get_extracted_text_structure_as_msgpack(status.request_id)
        assert structure.language == 'en'
        assert len(structure.pages) == 22
        assert len(structure.paragraphs) == 1
        assert len(structure.sentences) > 2

        log.info('Text extraction results look good. All assertions passed.')
Exemplo n.º 2
0
def deliver_error(request_id: str,
                  request_callback_info: RequestCallbackInfo,
                  problem: Optional[str] = None,
                  exc: Optional[Exception] = None):
    """Mark a request as failed and deliver the failure status to the caller.

    The stored request metadata is loaded, switched to ``STATUS_FAILURE``,
    extended with the error details (when ``problem`` or ``exc`` is given)
    and saved back. A ``RequestStatus`` with the failure status is then
    passed to ``deliver_results``.

    If the metadata does not exist in storage, a warning is logged and
    nothing is delivered. If loading or saving the metadata raises, the
    error is logged and the failure status is still delivered.

    :param request_id: identifier of the failed request.
    :param request_callback_info: callback settings and original file name.
    :param problem: optional textual description of the failure.
    :param exc: optional exception that caused the failure.
    """
    req: Optional[RequestMetadata] = None
    try:
        req = load_request_metadata(request_id)
        if not req:
            log.warning(
                f'{request_callback_info.original_file_name} | Not delivering error '
                f'because the request files do not exist in storage: '
                f'(#{request_id})\n'
                f'This usually means the request is canceled.')
            return
        req.status = STATUS_FAILURE

        if problem or exc:
            req.append_error(problem, exc)

        save_request_metadata(req)
    except Exception as req_upd_err:
        log.error(
            f'{request_callback_info.original_file_name} | Unable to store failed status into '
            f'metadata of request #{request_id}',
            exc_info=req_upd_err)

    status_kwargs = dict(
        request_id=request_id,
        original_file_name=request_callback_info.original_file_name,
        status=STATUS_FAILURE,
        additional_info=request_callback_info.call_back_additional_info)
    # ``req`` is still None when loading the metadata itself raised above;
    # dereferencing it would raise AttributeError and the failure callback
    # would never be sent. Only pass the output format when it is known.
    # NOTE(review): relies on ``RequestStatus.output_format`` having a
    # default value - confirm.
    if req is not None:
        status_kwargs['output_format'] = req.output_format

    deliver_results(request_callback_info, RequestStatus(**status_kwargs))
Exemplo n.º 3
0
 def assert_func(rfile, headers):
     """Check that the delivered status reports OCR and a searchable PDF.

     Parses a ``RequestStatus`` from ``rfile``; ``headers`` is not used.
     ``fn`` is taken from the enclosing scope.
     """
     log.info('Text extraction results are ready...')
     status: RequestStatus = RequestStatus.from_json(rfile)

     assert status.status == 'DONE'
     expected_name = os.path.basename(fn)
     assert expected_name == status.original_file_name
     assert status.pdf_pages_ocred
     assert status.searchable_pdf_created

     log.info('Text extraction results look good. All assertions passed.')
Exemplo n.º 4
0
def deliver_results(req: RequestCallbackInfo, req_status: RequestStatus):
    """Deliver the request status through every configured callback channel.

    The status is POSTed as JSON to ``req.call_back_url`` when it is set and
    sent as a celery task when ``req.call_back_celery_broker`` is set. A
    failure in either channel is logged and does not stop the other channel
    or propagate to the caller. A summary line is logged at the end.

    :param req: callback settings of the request.
    :param req_status: status to deliver; serialized with ``to_dict()``.
    """
    if req.call_back_url:
        try:
            log.info(
                f'{req.original_file_name} | POSTing the extraction results to {req.call_back_url}...'
            )
            response = requests.post(req.call_back_url, json=req_status.to_dict())
            # An HTTP error status means the receiver did not accept the
            # results; route it into the error log below instead of
            # silently ignoring it.
            response.raise_for_status()
        except Exception as err:
            log.error(
                f'{req.original_file_name} | Unable to POST the extraction results to {req.call_back_url}',
                exc_info=err)

    if req.call_back_celery_broker:
        try:
            log.info(
                f'{req.original_file_name} | Sending the extraction results as a celery task:\n'
                f'broker: {req.call_back_celery_broker}\n'
                f'queue: {req.call_back_celery_queue}\n'
                f'task_name: {req.call_back_celery_task_name}\n')
            send_task(broker_url=req.call_back_celery_broker,
                      queue=req.call_back_celery_queue,
                      task_name=req.call_back_celery_task_name,
                      task_kwargs=req_status.to_dict(),
                      task_id=req.call_back_celery_task_id,
                      parent_task_id=req.call_back_celery_parent_task_id,
                      root_task_id=req.call_back_celery_root_task_id,
                      celery_version=req.call_back_celery_version)
        except Exception as err:
            log.error(
                f'{req.original_file_name} | Unable to send the extraction results as a celery task:\n'
                f'broker: {req.call_back_celery_broker}\n'
                f'queue: {req.call_back_celery_queue}\n'
                f'task_name: {req.call_back_celery_task_name}\n',
                exc_info=err)

    # List only the steps that actually happened; joining empty placeholders
    # would leave stray separators such as ", , pages OCRed" in the log.
    completed_steps = (
        ('plain text', req_status.plain_text_extracted),
        ('coords extracted', req_status.pdf_coordinates_extracted),
        ('pages OCRed', req_status.pdf_pages_ocred),
    )
    status_extra = ', '.join(label for label, done in completed_steps if done)
    log.info(
        f'{req.original_file_name} | Finished processing request (#{req.request_id}). {status_extra}'
    )
Exemplo n.º 5
0
    def assert_func(rfile, headers):
        """Check the delivered status for a run without table extraction.

        Parses a ``RequestStatus`` from ``rfile``; ``headers`` is not used.
        ``fn`` is taken from the enclosing scope.
        """
        log.info('Text extraction results are ready...')
        status: RequestStatus = RequestStatus.from_json(rfile)

        assert status.status == 'DONE'
        expected_name = os.path.basename(fn)
        assert expected_name == status.original_file_name
        assert status.converted_cleaned_pdf is False
        assert status.searchable_pdf_created
        assert not status.tables_extracted
        assert status.plain_text_extracted
        assert status.text_structure_extracted

        log.info('Text extraction results look good. All assertions passed.')
Exemplo n.º 6
0
    def assert_func(rfile, headers):
        """Check the delivered status and the number of extracted tables.

        Parses a ``RequestStatus`` from ``rfile``, verifies its flags and then
        downloads the extracted tables through ``client``, expecting six of
        them. ``headers`` is not used. ``fn`` and ``client`` are taken from
        the enclosing scope.
        """
        log.info('Text extraction results are ready...')
        status: RequestStatus = RequestStatus.from_json(rfile)

        assert status.status == 'DONE'
        expected_name = os.path.basename(fn)
        assert expected_name == status.original_file_name
        assert status.converted_cleaned_pdf is False
        assert status.searchable_pdf_created
        assert status.tables_extracted
        assert status.plain_text_extracted
        assert status.text_structure_extracted

        extracted: TableList = \
            client.get_extracted_tables_as_msgpack(status.request_id)
        table_count = len(extracted.tables)
        assert table_count == 6

        log.info('Text extraction results look good. All assertions passed.')
 def to_request_status(self) -> RequestStatus:
     """Build a ``RequestStatus`` from the fields of this object.

     Identification, status and error fields are copied as they are. Each
     ``*_created`` / ``*_extracted`` flag is True when the corresponding
     attribute of this object is not None.
     """
     status_fields = {
         # Copied verbatim.
         'request_id': self.request_id,
         'original_file_name': self.original_file_name,
         'status': self.status,
         'error_message': self.error_message,
         # Flags derived from whether the matching attribute is set.
         'converted_cleaned_pdf': self.converted_to_pdf is not None,
         'searchable_pdf_created': self.ocred_pdf is not None,
         'corrected_pdf_created': self.corrected_pdf is not None,
         'pdf_pages_ocred': self.pdf_pages_ocred,
         'tables_extracted': self.tables_file is not None,
         'plain_text_extracted': self.plain_text_file is not None,
         'text_structure_extracted': self.text_structure_file is not None,
         'pdf_coordinates_extracted': self.pdf_coordinates_file is not None,
         # Callback payload and output settings.
         'additional_info': self.request_callback_info.call_back_additional_info,
         'output_format': self.output_format,
         'page_rotate_angles': self.page_rotate_angles,
     }
     return RequestStatus(**status_fields)
Exemplo n.º 8
0
    def assert_func(rfile, headers):
        """Check the delivered results for the 4-page document.

        Parses a ``RequestStatus`` from ``rfile``, verifies its flags and then
        downloads the plain text and the text structure through ``client``,
        expecting four pages and one known phrase per page. ``headers`` is
        not used. ``fn`` and ``client`` are taken from the enclosing scope.
        """
        log.info('Text extraction results are ready...')
        status: RequestStatus = RequestStatus.from_json(rfile)

        assert status.status == 'DONE'
        expected_name = os.path.basename(fn)
        assert expected_name == status.original_file_name
        assert status.converted_cleaned_pdf
        assert status.plain_text_extracted
        assert status.text_structure_extracted
        assert status.additional_info == 'hello world'

        plain_text = client.get_plain_text(status.request_id)
        structure: PlainTextStructure = \
            client.get_extracted_text_structure_as_msgpack(status.request_id)
        assert len(structure.pages) == 4

        # One marker phrase per page, in page order.
        assert 'REPRODUCTION, AND DISTRIBUTION' in plain_text  # page 1
        assert 'subsequently incorporated' in plain_text  # page 2
        assert 'conditions stated in this License. ' in plain_text  # page 3
        assert 'See the License for the specific language governing' in plain_text  # page 4

        log.info('Text extraction results look good. All assertions passed.')
    def assert_func(rfile, headers):
        """Check the delivered results for the single-page en/ru document.

        Parses a ``RequestStatus`` from ``rfile``, verifies its flags and then
        downloads the plain text, the PDF and the text structure through
        ``client``. The detected language must be 'en' or 'ru'; the text must
        contain only the phrases of the detected language, and every
        paragraph and sentence must carry that language. ``headers`` is not
        used. ``fn`` and ``client`` are taken from the enclosing scope.
        """
        log.info('Text extraction results are ready...')
        status: RequestStatus = RequestStatus.from_json(rfile)

        assert status.status == 'DONE'
        expected_name = os.path.basename(fn)
        assert expected_name == status.original_file_name
        assert status.converted_cleaned_pdf is False
        assert status.tables_extracted is False
        assert status.plain_text_extracted
        assert status.text_structure_extracted

        plain_text = client.get_plain_text(status.request_id)

        with client.get_pdf_as_local_file(status.request_id) as local_pdf, \
                pikepdf.open(local_pdf) as pdf_doc:
            assert len(pdf_doc.pages) == 1

        structure: PlainTextStructure = \
            client.get_extracted_text_structure_as_msgpack(status.request_id)
        assert structure.language in ('en', 'ru')

        # Only the phrases of the detected language may be present.
        if structure.language == 'en':
            assert 'This is top secret' in plain_text
            assert 'Top.' in plain_text
            assert 'являлся Тор.' not in plain_text
        elif structure.language == 'ru':
            assert 'This is top secret' not in plain_text
            assert 'Top.' not in plain_text
            assert 'являлся Тор.' in plain_text

        assert len(structure.pages) == 1

        # Every paragraph and sentence must agree with the document language.
        assert len(structure.paragraphs) == 1
        for paragraph in structure.paragraphs:
            assert paragraph.language == structure.language
        assert len(structure.sentences) == 3
        for sentence in structure.sentences:
            assert sentence.language == structure.language

        log.info('Text extraction results look good. All assertions passed.')
Exemplo n.º 10
0
 def assert_func(rfile, headers):
     """Check that the delivered status reports a failed request.

     Parses a ``RequestStatus`` from ``rfile``; ``headers`` is not used.
     """
     log.info('Text extraction results are ready...')
     status: RequestStatus = RequestStatus.from_json(rfile)

     assert status.status == 'FAILURE'

     log.info('Text extraction results look good. All assertions passed.')
Exemplo n.º 11
0
 def get_data_extraction_task_status(self, request_id: str) -> RequestStatus:
     """Fetch the status of a data extraction task.

     Issues a GET to the task's ``status.json`` endpoint under
     ``self.base_url``, passes the response to ``self.raise_for_status`` and
     parses the response body with ``RequestStatus.from_json``.

     :param request_id: identifier of the data extraction request.
     :return: the parsed request status.
     """
     response = requests.get(
         f'{self.base_url}/api/v1/data_extraction_tasks/{request_id}/status.json')
     self.raise_for_status(response)
     return RequestStatus.from_json(response.content)