Example #1
def to_text(path, **kwargs):
    """Sends PDF files to Google DocumentAi for OCR.

    Before using invoice2data, make sure you have the auth json path set as
    env var GOOGLE_APPLICATION_CREDENTIALS

    Parameters
    ----------
    path : str
        path of electronic invoice in JPG or PNG format
    bucket_name : str
        name of bucket to use for file storage and results cache.

    Returns
    -------
    extracted_str : str
        returns extracted text from image in JPG or PNG format

    """
    """OCR with PDF/TIFF as source files on GCS"""
    from google.cloud import documentai_v1 as documentai

    # You must set the api_endpoint if you use a location other than 'us', e.g.:
    opts = {"api_endpoint": "eu-documentai.googleapis.com"}

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    project_id = kwargs.get("project_id", None)
    processor = kwargs.get("processor", None)
    name = f"projects/{project_id}/locations/eu/processors/{processor}"

    # Read the file into memory
    with open(path, "rb") as image:
        image_content = image.read()

    document = {"content": image_content, "mime_type": "application/pdf"}

    # Configure the process request
    request = {"name": name, "raw_document": document}

    result = client.process_document(request=request)
    document = result.document

    document_pages = document.pages

    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

    # Read the text recognition output from the processor
    paragraph_text = ""
    for page in document_pages:
        paragraphs = page.paragraphs
        for paragraph in paragraphs:
            paragraph_text += get_text(paragraph.layout, document)

    return paragraph_text.encode('utf-8')
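# This example (and the ones below) call a `get_text` helper that is not shown on
# this page. A minimal sketch of such a helper, assuming the standard Document AI
# text anchors (layout.text_anchor.text_segments indexing into document.text):
def get_text(doc_element, document):
    """Resolve a layout element's text anchor into the matching slice of document.text."""
    response = ""
    for segment in doc_element.text_anchor.text_segments:
        start_index = int(segment.start_index)
        end_index = int(segment.end_index)
        response += document.text[start_index:end_index]
    return response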
Example #2
def start1003Parser(event, context):
    project_id = os.environ['project_id']
    processor_id = os.environ['processor_id']
    location = os.environ['location']
    table_id = os.environ['BQ_Table_Id']
    file_path = event['name']
    input_bucket = event['bucket']

    print("project_id:{},processor_id:{},file_path:{}, input bucket:{}".format(
        project_id, processor_id, file_path, input_bucket))

    # You must set the api_endpoint if you use a location other than 'us', e.g.:
    opts = {}
    if location == "eu":
        opts = {"api_endpoint": "eu-documentai.googleapis.com"}

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Download the file from Cloud Storage into memory
    image_content = download_blob(input_bucket, file_path).download_as_bytes()
    document = {"content": image_content, "mime_type": "application/pdf"}

    # Configure the process request
    request = {"name": name, "raw_document": document}

    # Recognizes text entities in the PDF document
    result = client.process_document(request=request)

    document = result.document

    print("Document processing complete.")

    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

    entityDict = {}
    entityDict["fileName"] = file_path
    for entity in document.entities:
        entity_type = entity.type_
        if entity.normalized_value.text != "":
            entity_text = entity.normalized_value.text
        else:
            entity_text = re.sub(r'[":]', '', entity.mention_text)

        # Placeholder code below to test whether the amount fields have strings with commas coming in. Converting them to floats for now.
        if ("amount" in entity_type and entity.normalized_value.text == ''):
            entity_text = float(re.sub('\D', '', entity.mention_text))

        entityDict[entity_type] = entity_text

    # Call the writeToBQ helper to store the extracted entities in BigQuery
    writeToBQ(entityDict, table_id)
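# `download_blob` and `writeToBQ` are helpers defined elsewhere in this example's
# module. Hedged sketches of what they might look like, assuming the
# google-cloud-storage and google-cloud-bigquery clients; the names and signatures
# below are inferred from the calls above, not taken from the original source:
from google.cloud import storage, bigquery

def download_blob(bucket_name, source_blob_name):
    """Return a blob handle; the caller reads it with download_as_bytes()."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    return bucket.blob(source_blob_name)

def writeToBQ(row_dict, table_id):
    """Stream one row of extracted entities into the BigQuery table."""
    bq_client = bigquery.Client()
    errors = bq_client.insert_rows_json(table_id, [row_dict])
    if errors:
        print(f"BigQuery insert errors: {errors}")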
def process_document_sample(
    project_id: str, location: str, processor_id: str, file_path: str
):
    from google.cloud import documentai_v1 as documentai

    # You must set the api_endpoint if you use a location other than 'us', e.g.:
    opts = {}
    if location == "eu":
        opts = {"api_endpoint": "eu-documentai.googleapis.com"}

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    document = {"content": image_content, "mime_type": "application/pdf"}

    # Configure the process request
    request = {"name": name, "raw_document": document}

    # Recognizes text entities in the PDF document
    result = client.process_document(request=request)

    document = result.document

    print("Document processing complete.")

    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

    document_pages = document.pages

    # Read the text recognition output from the processor
    print("The document contains the following paragraphs:")
    for page in document_pages:
        paragraphs = page.paragraphs
        for paragraph in paragraphs:
            paragraph_text = get_text(paragraph.layout, document)
            print(f"Paragraph text: {paragraph_text}")
def batch_process_documents(
    project_id,
    location,
    processor_id,
    gcs_input_uri,
    gcs_output_uri,
    gcs_output_uri_prefix,
    timeout: int = 300,
):

    # You must set the api_endpoint if you use a location other than 'us', e.g.:
    opts = {}
    if location == "eu":
        opts = {"api_endpoint": "eu-documentai.googleapis.com"}

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    destination_uri = f"{gcs_output_uri}/{gcs_output_uri_prefix}/"

    gcs_documents = documentai.GcsDocuments(
        documents=[{
            "gcs_uri": gcs_input_uri,
            "mime_type": "application/pdf"
        }])

    # 'mime_type' can be 'application/pdf', 'image/tiff',
    # 'image/gif', or 'application/json'
    input_config = documentai.BatchDocumentsInputConfig(
        gcs_documents=gcs_documents)

    # Where to write results
    output_config = documentai.DocumentOutputConfig(
        gcs_output_config={"gcs_uri": destination_uri})

    # Location can be 'us' or 'eu'
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    request = documentai.types.document_processor_service.BatchProcessRequest(
        name=name,
        input_documents=input_config,
        document_output_config=output_config,
    )

    operation = client.batch_process_documents(request)

    # Wait for the operation to finish
    operation.result(timeout=timeout)

    # Results are written to GCS. Use a regex to find
    # output files
    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    output_bucket = match.group(1)
    prefix = match.group(2)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print("Output files:")

    for i, blob in enumerate(blob_list):
        # If JSON file, download the contents of this blob as a bytes object.
        if ".json" in blob.name:
            blob_as_bytes = blob.download_as_bytes()

            document = documentai.types.Document.from_json(blob_as_bytes)
            print(f"Fetched file {i + 1}")

            # For a full list of Document object attributes, please reference this page:
            # https://cloud.google.com/document-ai/docs/reference/rpc/google.cloud.documentai.v1beta3#document

            # Read the text recognition output from the processor
            for page in document.pages:
                for form_field in page.form_fields:
                    field_name = get_text(form_field.field_name, document)
                    field_value = get_text(form_field.field_value, document)
                    print("Extracted key value pair:")
                    print(f"\t{field_name}, {field_value}")
                for paragraph in page.paragraphs:
                    paragraph_text = get_text(paragraph.layout, document)
                    print(f"Paragraph text:\n{paragraph_text}")
        else:
            print(f"Skipping non-supported file type {blob.name}")
def process_document_sample(
    project_id: str, location: str, processor_id: str, file_path: str, template_path: str
):

    # You must set the api_endpoint if you use a location other than 'us', e.g.:
    opts = {}
    if location == "eu":
        opts = {"api_endpoint": "eu-documentai.googleapis.com"}

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the file and the matching template image into memory
    with open(file_path, "rb") as image:
        image_content = image.read()
    template = cv2.imread(template_path)

    document = {"content": image_content, "mime_type": "application/pdf"}

    # Configure the process request
    request = {"name": name, "raw_document": document}

    # Recognizes text entities in the PDF document
    result = client.process_document(request=request)

    document = result.document

    print("Document processing complete.")

    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

    document_pages = document.pages

    # Initialise accuracy arrays
    doc_acc = []
    field_acc = np.zeros((len(FIELD_NAMES), 1))
    doc_text = []
    confidence = []

    # Read the text recognition output from the processor
    print("The document contains the following paragraphs:")
    for page in document_pages:
        # # field_text = ''
        # for idx, paragraph in enumerate(page.paragraphs):
        #   field_text = get_text(paragraph.layout, document)
        #   print(f'Paragraph: {idx} \n{field_text} \n\n')

        for idx, field in enumerate(FIELD_NAMES):
            field_text = ''
            text_loc = []
            paragraph_confidence = []
            ground_field = ground_truth[field[0]]
            for paragraph_num in field.paragraph:
                field_text += get_text(page.paragraphs[paragraph_num].layout, document)
                paragraph_confidence.append(page.paragraphs[paragraph_num].layout.confidence)
                text_loc.append(page.paragraphs[paragraph_num].layout)

            field_text = remove_from_text(field_text, OCR_LOCATIONS[idx].filter_keywords)

            doc_text.append(field_text.replace('\n', ', '))
            field_acc[idx], doc_acc, field_text = ocr_acc(ground_field, field_text.split('\n'), doc_acc)
            print(f'Paragraph {idx}: (Field Accuracy = {field_acc[idx]}) \nParagraph Confidence: {sum(paragraph_confidence)/len(paragraph_confidence)} \n{field_text}\n\n')
            confidence.append(sum(paragraph_confidence)/len(paragraph_confidence))

    doc2csv(doc_text, confidence)
    put_OCR(field_text, layout, image)
    print(f'Document Accuracy = {doc_acc[idx]} \n Confidence: {confidence}')
    return document_pages, document, template
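# The example above relies on module-level names not shown on this page
# (FIELD_NAMES, ground_truth, OCR_LOCATIONS, ocr_acc, remove_from_text, put_OCR,
# doc2csv, plus numpy as np and cv2). As one hypothetical illustration, doc2csv
# could be as simple as writing each field's text next to its average confidence:
import csv

def doc2csv(doc_text, confidence, out_path="ocr_results.csv"):
    """Write extracted field text and its average confidence to a CSV file."""
    with open(out_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["field_text", "confidence"])
        writer.writerows(zip(doc_text, confidence))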