def test_job_timeout(indico):
    """Check that waiting on a job with a zero timeout raises ``IndicoTimeoutError``.

    A document extraction job is submitted for the mock PDF, then its status
    is requested with ``wait=True`` and ``timeout=0.0``; the test expects that
    call to raise.
    """
    api = IndicoClient()
    base_dir = Path(__file__).parents[1]
    pdf_path = str(base_dir) + "/data/mock.pdf"

    extraction = DocumentExtraction(
        files=[pdf_path], json_config='{"preset_config": "detailed"}'
    )
    submitted = api.call(extraction)[0]

    # The status call itself is expected to raise, so its result is not kept.
    with pytest.raises(IndicoTimeoutError):
        api.call(JobStatus(id=submitted.id, wait=True, timeout=0.0))
def test_job_wait_on_failure(indico):
    """Check that waiting on a failing extraction job reports ``FAILURE``.

    The job is submitted with the ``"wrong"`` preset config, which this test
    expects to make the job fail. After waiting, the job status must be
    ``"FAILURE"`` and the job result must be a dict.
    """
    client = IndicoClient()
    dataset_filepath = str(Path(__file__).parents[1] / "data" / "mock.pdf")

    jobs = client.call(
        DocumentExtraction(
            files=[dataset_filepath], json_config='{"preset_config": "wrong"}'
        )
    )
    assert len(jobs) == 1

    job = jobs[0]
    # Identity comparison is the correct check against None.
    assert job.id is not None

    job = client.call(JobStatus(id=job.id, wait=True))
    assert job.status == "FAILURE"
    assert isinstance(job.result, dict)
def test_job_wait_on_success(indico):
    """Check that waiting on a valid extraction job reports ``SUCCESS``.

    The job is submitted with the ``"simple"`` preset config. After waiting,
    the job must be ready with status ``"SUCCESS"``, and its result must
    carry a string under the ``"url"`` key.
    """
    client = IndicoClient()
    dataset_filepath = str(Path(__file__).parents[1] / "data" / "mock.pdf")

    jobs = client.call(
        DocumentExtraction(
            files=[dataset_filepath], json_config='{"preset_config": "simple"}'
        )
    )
    assert len(jobs) == 1

    job = jobs[0]
    # Identity comparison is the correct check against None.
    assert job.id is not None

    job = client.call(JobStatus(id=job.id, wait=True))
    assert job.status == "SUCCESS"
    assert job.ready is True
    assert isinstance(job.result["url"], str)
def test_document_extraction(indico):
    """Run a default document extraction end to end and fetch its output.

    Submits the mock PDF without a ``json_config``, waits for the job, and
    then retrieves the stored result via ``RetrieveStorageObject``. The
    retrieved object must be a dict containing a ``"pages"`` key.
    """
    client = IndicoClient()
    dataset_filepath = str(Path(__file__).parents[1] / "data" / "mock.pdf")

    jobs = client.call(DocumentExtraction(files=[dataset_filepath]))
    assert len(jobs) == 1

    job = jobs[0]
    assert job.id is not None

    job = client.call(JobStatus(id=job.id, wait=True))
    assert job.status == "SUCCESS"
    assert job.ready is True
    assert isinstance(job.result["url"], str)

    # The job result is passed whole to the storage query to get the output.
    extract = client.call(RetrieveStorageObject(job.result))
    assert isinstance(extract, dict)
    assert "pages" in extract
def test_document_extraction_batched(indico):
    """Submit three PDFs with ``upload_batch_size=1`` and verify every job.

    The test expects one job per submitted file. Each job is waited on and
    must finish with status ``"SUCCESS"``, be ready, and expose a string
    under ``result["url"]``.
    """
    api = IndicoClient()
    pdf_names = ["mock.pdf", "mock_2.pdf", "mock_3.pdf"]
    data_dir = str(Path(__file__).parent.parent / "data")
    pdf_paths = [os.path.join(data_dir, pdf_name) for pdf_name in pdf_names]

    extraction = DocumentExtraction(
        files=pdf_paths,
        json_config={"preset_config": "simple"},
        upload_batch_size=1,
    )
    submitted_jobs = api.call(extraction)
    assert len(submitted_jobs) == 3

    for submitted in submitted_jobs:
        assert submitted.id is not None
        finished = api.call(JobStatus(id=submitted.id, wait=True))
        assert finished.status == "SUCCESS"
        assert finished.ready is True
        assert isinstance(finished.result["url"], str)
""" Example demonstrating how to OCR a document and access the text at the document, page, and block (or paragraph) level. """ from indico import IndicoClient, IndicoConfig from indico.queries import DocumentExtraction, JobStatus, RetrieveStorageObject # Get the OCR object my_config = IndicoConfig(host="app.indico.io", api_token_path=".path/to/indico_api_token.txt") client = IndicoClient(config=my_config) files_to_extract = client.call( DocumentExtraction(files=["./test_paragraphs.pdf"], json_config={"preset_config": "standard"})) extracted_file = client.call(JobStatus(id=files_to_extract[0].id, wait=True)) json_result = client.call(RetrieveStorageObject(extracted_file.result)) # The code below shows how to get the OCR text from the 'json_result' object. # Note: it may vary slightly if you use DocumentExtraction configurations other than 'standard' # Full Text full_document_text = json_result["text"] # Doucment Text split by page text_by_page = list() for page in json_result["pages"]: text_by_page.append(page["text"]) # Document Text split by block (or paragraph)
from indico import IndicoClient, IndicoConfig
from indico.queries import DocumentExtraction, JobStatus, RetrieveStorageObject

# Create an Indico API client from a host name and an API token file
my_config = IndicoConfig(
    host="app.indico.io", api_token_path="./path/to/indico_api_token.txt"
)
client = IndicoClient(config=my_config)

# OCR a single file and wait for it to complete
ocr_query = DocumentExtraction(
    files=["./path_to_doc.pdf"], json_config={"preset_config": "ondocument"}
)
job = client.call(ocr_query)  # a list of jobs; only the first entry is used below
extracted_file = client.call(JobStatus(id=job[0].id, wait=True))

# Fetch and print the stored result only when the job succeeded
if extracted_file.status == "SUCCESS":
    result = client.call(RetrieveStorageObject(extracted_file.result))
    print(result)