Python DocumentExtraction примеры использования

Язык программирования: Python

Пространство имен/Пакет: indico.queries

Класс/Тип: DocumentExtraction

Примеров на hotexamples.com: 7

Python DocumentExtraction - 7 примеров найдено. Это лучшие примеры Python кода для indico.queries.DocumentExtraction, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать / Скрыть

DocumentExtraction(7)

Основные методы

DocumentExtraction (7)

Пример #1

Показать файл

Файл: test_job.py Проект: IndicoDataSolutions/indico-client-python

def test_job_timeout(indico):
    """Check that waiting on a job with a zero timeout raises IndicoTimeoutError.

    A document extraction is submitted for the mock PDF, then its status is
    requested with ``wait=True`` and ``timeout=0.0``.

    The ``indico`` argument is not referenced in the body (presumably a
    pytest fixture that prepares the environment -- confirm in conftest).
    """
    client = IndicoClient()
    # Build the path with pathlib instead of string concatenation.
    dataset_filepath = str(Path(__file__).parents[1] / "data" / "mock.pdf")
    job = client.call(
        DocumentExtraction(
            files=[dataset_filepath],
            json_config='{"preset_config": "detailed"}',
        )
    )[0]
    with pytest.raises(IndicoTimeoutError):
        # The return value is irrelevant here: the call is expected to raise.
        client.call(JobStatus(id=job.id, wait=True, timeout=0.0))

Пример #2

Показать файл

Файл: test_job.py Проект: IndicoDataSolutions/indico-client-python

def test_job_wait_on_failure(indico):
    """Check that an extraction submitted with the "wrong" preset ends in FAILURE.

    Submits the mock PDF with ``{"preset_config": "wrong"}``, waits for the
    single resulting job, and asserts that its status is ``"FAILURE"`` and
    that its result is a dict.

    The ``indico`` argument is not referenced in the body (presumably a
    pytest fixture -- confirm in conftest).
    """
    client = IndicoClient()
    dataset_filepath = str(Path(__file__).parents[1] / "data" / "mock.pdf")

    jobs = client.call(
        DocumentExtraction(
            files=[dataset_filepath],
            json_config='{"preset_config": "wrong"}',
        )
    )

    assert len(jobs) == 1
    job = jobs[0]
    # Identity comparison is the correct check against the None singleton.
    assert job.id is not None
    job = client.call(JobStatus(id=job.id, wait=True))
    assert job.status == "FAILURE"
    assert isinstance(job.result, dict)

Пример #3

Показать файл

Файл: test_job.py Проект: IndicoDataSolutions/indico-client-python

def test_job_wait_on_success(indico):
    """Check that an extraction submitted with the "simple" preset succeeds.

    Submits the mock PDF with ``{"preset_config": "simple"}``, waits for the
    single resulting job, and asserts that it is ``"SUCCESS"``, ready, and
    that ``result["url"]`` is a string.

    The ``indico`` argument is not referenced in the body (presumably a
    pytest fixture -- confirm in conftest).
    """
    client = IndicoClient()
    dataset_filepath = str(Path(__file__).parents[1] / "data" / "mock.pdf")

    jobs = client.call(
        DocumentExtraction(
            files=[dataset_filepath],
            json_config='{"preset_config": "simple"}',
        )
    )

    assert len(jobs) == 1
    job = jobs[0]
    # Identity checks against the None/True singletons, matching the other
    # extraction tests.
    assert job.id is not None
    job = client.call(JobStatus(id=job.id, wait=True))
    assert job.status == "SUCCESS"
    assert job.ready is True
    assert isinstance(job.result["url"], str)

Пример #4

Показать файл

def test_document_extraction(indico):
    """Check a default-configuration extraction end to end.

    Submits the mock PDF without a ``json_config``, waits for the single
    resulting job to succeed, retrieves the stored result, and asserts that
    it is a dict containing a ``"pages"`` key.

    The ``indico`` argument is not referenced in the body (presumably a
    pytest fixture -- confirm in conftest).
    """
    client = IndicoClient()
    dataset_filepath = str(Path(__file__).parents[1] / "data" / "mock.pdf")

    jobs = client.call(DocumentExtraction(files=[dataset_filepath]))

    assert len(jobs) == 1
    job = jobs[0]
    assert job.id is not None
    job = client.call(JobStatus(id=job.id, wait=True))
    assert job.status == "SUCCESS"
    assert job.ready is True
    # isinstance instead of an exact type() comparison.
    assert isinstance(job.result["url"], str)

    extract = client.call(RetrieveStorageObject(job.result))

    assert isinstance(extract, dict)
    assert "pages" in extract

Пример #5

Показать файл

def test_document_extraction_batched(indico):
    """Check that three PDFs submitted with ``upload_batch_size=1`` each succeed.

    One job per file is expected; every job is waited on and must report
    ``"SUCCESS"``, be ready, and carry a string ``result["url"]``.
    """
    client = IndicoClient()
    data_dir = str(Path(__file__).parent.parent / "data")
    pdf_paths = []
    for pdf_name in ["mock.pdf", "mock_2.pdf", "mock_3.pdf"]:
        pdf_paths.append(os.path.join(data_dir, pdf_name))

    request = DocumentExtraction(
        files=pdf_paths,
        json_config={"preset_config": "simple"},
        upload_batch_size=1,
    )
    submitted = client.call(request)

    assert len(submitted) == 3
    for pending in submitted:
        assert pending.id is not None
        finished = client.call(JobStatus(id=pending.id, wait=True))
        assert finished.status == "SUCCESS"
        assert finished.ready is True
        assert isinstance(finished.result["url"], str)

Пример #6

Показать файл

"""
Example demonstrating how to OCR a document and access the text at the document, page, and 
block (or paragraph) level.
"""

from indico import IndicoClient, IndicoConfig
from indico.queries import DocumentExtraction, JobStatus, RetrieveStorageObject

# Build the client from the host name and the path to the API token file
my_config = IndicoConfig(
    host="app.indico.io",
    api_token_path="./path/to/indico_api_token.txt",
)
client = IndicoClient(config=my_config)

# Submit the PDF for OCR, wait for the first job to finish, then fetch its
# stored result
files_to_extract = client.call(
    DocumentExtraction(
        files=["./test_paragraphs.pdf"],
        json_config={"preset_config": "standard"},
    )
)
extracted_file = client.call(JobStatus(id=files_to_extract[0].id, wait=True))
json_result = client.call(RetrieveStorageObject(extracted_file.result))

# The code below shows how to get the OCR text from the 'json_result' object.
# Note: it may vary slightly if you use DocumentExtraction configurations other than 'standard'

# Full Text
full_document_text = json_result["text"]

# Document Text split by page
text_by_page = [page["text"] for page in json_result["pages"]]

# Document Text split by block (or paragraph)

Пример #7

Показать файл

from indico import IndicoClient, IndicoConfig
from indico.queries import DocumentExtraction, JobStatus, RetrieveStorageObject

# Build the Indico API client
my_config = IndicoConfig(
    host="app.indico.io",
    api_token_path="./path/to/indico_api_token.txt",
)
client = IndicoClient(config=my_config)

# Submit one document for OCR, then wait for its job to finish
extraction_request = DocumentExtraction(
    files=["./path_to_doc.pdf"],
    json_config={"preset_config": "ondocument"},
)
job = client.call(extraction_request)
status_request = JobStatus(id=job[0].id, wait=True)
extracted_file = client.call(status_request)

# Retrieve and print the stored result only when the job succeeded
if extracted_file.status == "SUCCESS":
    result = client.call(RetrieveStorageObject(extracted_file.result))
    print(result)