def to_text(path, **kwargs):
    """Send a local document to Google Document AI and return its OCR text.

    Before using invoice2data, make sure the path of the auth JSON is set in
    the env var GOOGLE_APPLICATION_CREDENTIALS.

    Parameters
    ----------
    path : str
        Path of the local file to process; its bytes are sent inline with
        the request.
    project_id : str
        Google Cloud project that owns the processor (required keyword).
    processor : str
        ID of the processor to call (required keyword). Processors must be
        created in the Cloud Console first.
    location : str, optional
        Region of the processor. Defaults to ``"eu"``.
    mime_type : str, optional
        MIME type declared for the file. Defaults to ``"application/pdf"``.

    Returns
    -------
    extracted_str : bytes
        UTF-8 encoded concatenation of the text of every paragraph, in page
        order.

    Raises
    ------
    ValueError
        If ``project_id`` or ``processor`` is missing or empty.
    """
    project_id = kwargs.get("project_id")
    processor = kwargs.get("processor")
    # Fail fast: without these the resource name would silently become
    # "projects/None/..." and only be rejected by the remote service.
    if not project_id or not processor:
        raise ValueError(
            "to_text() requires the 'project_id' and 'processor' keyword arguments"
        )
    location = kwargs.get("location") or "eu"
    mime_type = kwargs.get("mime_type") or "application/pdf"

    from google.cloud import documentai_v1 as documentai

    # You must set the api_endpoint if you use a location other than 'us'.
    opts = {}
    if location != "us":
        opts = {"api_endpoint": f"{location}-documentai.googleapis.com"}
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    name = f"projects/{project_id}/locations/{location}/processors/{processor}"

    # Read the file into memory
    with open(path, "rb") as image:
        image_content = image.read()

    # Configure the process request
    request = {
        "name": name,
        "raw_document": {"content": image_content, "mime_type": mime_type},
    }
    document = client.process_document(request=request).document

    # Collect the pieces and join once instead of growing a string in a loop.
    pieces = []
    for page in document.pages:
        for paragraph in page.paragraphs:
            pieces.append(get_text(paragraph.layout, document))
    return "".join(pieces).encode("utf-8")
def _parse_amount(raw_text):
    """Convert an amount mention such as '$1,234.56' to a float, or None if it cannot be parsed."""
    # Keep digits, the decimal point and a minus sign; drop currency symbols,
    # thousands separators and whitespace. Stripping every non-digit would
    # turn "1,234.56" into 123456.0.
    cleaned = re.sub(r"[^\d.\-]", "", raw_text)
    try:
        return float(cleaned)
    except ValueError:
        return None


def start1003Parser(event, context):
    """Parse an uploaded PDF with Document AI and write its entities to BigQuery.

    Configuration is read from the environment variables ``project_id``,
    ``processor_id``, ``location`` and ``BQ_Table_Id``.

    Args:
        event: Mapping describing the uploaded object; the keys ``name``
            (object path) and ``bucket`` (bucket name) are read.
        context: Not used by this function.

    Each entity is stored under its type. The normalized value is preferred
    when present; otherwise the mention text is used with double quotes and
    colons removed. For entity types containing "amount" the mention text is
    converted to a float (US-style formatting assumed); an amount that cannot
    be parsed is reported and left out of the row instead of aborting the
    whole document.
    """
    project_id = os.environ['project_id']
    processor_id = os.environ['processor_id']
    location = os.environ['location']
    table_id = os.environ['BQ_Table_Id']
    file_path = event['name']
    input_bucket = event['bucket']

    print("project_id:{},processor_id:{},file_path:{}, input bucket:{}".format(
        project_id, processor_id, file_path, input_bucket))

    # You must set the api_endpoint if you use a location other than 'us', e.g.:
    opts = {}
    if location == "eu":
        opts = {"api_endpoint": "eu-documentai.googleapis.com"}

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the file into memory
    image_content = download_blob(input_bucket, file_path).download_as_bytes()
    document = {"content": image_content, "mime_type": "application/pdf"}

    # Configure the process request
    request = {"name": name, "raw_document": document}

    # Recognizes text entities in the PDF document
    result = client.process_document(request=request)
    document = result.document
    print("Document processing complete.")

    entityDict = {"fileName": file_path}
    for entity in document.entities:
        entity_type = entity.type_
        normalized_text = entity.normalized_value.text

        if normalized_text != "":
            entity_text = normalized_text
        elif "amount" in entity_type:
            entity_text = _parse_amount(entity.mention_text)
            if entity_text is None:
                print(f"Skipping unparseable amount for {entity_type}: "
                      f"{entity.mention_text!r}")
                continue
        else:
            # Remove double quotes and colons from the raw mention.
            entity_text = re.sub(r'[":]', '', entity.mention_text)

        entityDict[entity_type] = entity_text

    # Calling the writeToBQ method
    writeToBQ(entityDict, table_id)
def process_document_sample(
    project_id: str, location: str, processor_id: str, file_path: str
):
    """Process a local PDF with a Document AI processor and print its paragraphs.

    The file at ``file_path`` is read into memory, sent inline as
    ``application/pdf`` to the processor identified by ``project_id``,
    ``location`` and ``processor_id``, and the text of every paragraph on
    every page is printed. Nothing is returned.

    NOTE(review): another function with this same name is defined further
    down in this source; if both live in one module the later one wins.
    """
    from google.cloud import documentai_v1 as documentai

    # A regional endpoint is required for any location other than 'us'.
    client_options = (
        {"api_endpoint": "eu-documentai.googleapis.com"} if location == "eu" else {}
    )
    client = documentai.DocumentProcessorServiceClient(client_options=client_options)

    # Full resource name of the processor; it has to exist in the Cloud
    # Console before this call.
    processor_name = (
        f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    )

    with open(file_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    response = client.process_document(
        request={
            "name": processor_name,
            "raw_document": {"content": pdf_bytes, "mime_type": "application/pdf"},
        }
    )
    parsed = response.document
    print("Document processing complete.")

    # Walk the recognised pages and print each paragraph's text.
    print("The document contains the following paragraphs:")
    for page in parsed.pages:
        for paragraph in page.paragraphs:
            paragraph_text = get_text(paragraph.layout, parsed)
            print(f"Paragraph text: {paragraph_text}")
def batch_process_documents(
    project_id,
    location,
    processor_id,
    gcs_input_uri,
    gcs_output_uri,
    gcs_output_uri_prefix,
    timeout: int = 300,
):
    """Batch-process a PDF stored on GCS and print the extracted content.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location; can be 'us' or 'eu'.
        processor_id: ID of the processor to run.
        gcs_input_uri: ``gs://`` URI of the PDF to process.
        gcs_output_uri: ``gs://bucket[/path]`` URI under which results are
            written.
        gcs_output_uri_prefix: Sub-path appended to ``gcs_output_uri``.
        timeout: Seconds to wait for the batch operation to finish.

    Raises:
        ValueError: If the combined output location is not a ``gs://`` URI
            with a bucket and a path.

    For every ``.json`` result under the output location, the form-field
    key/value pairs and the paragraph text of each page are printed. Other
    objects are reported and skipped.
    """
    destination_uri = f"{gcs_output_uri}/{gcs_output_uri_prefix}/"

    # Validate the output location before starting the (slow, billable)
    # batch job rather than failing on a None match afterwards.
    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    if match is None:
        raise ValueError(
            f"Output location must look like gs://bucket/path, got {destination_uri!r}"
        )
    output_bucket, prefix = match.groups()

    # You must set the api_endpoint if you use a location other than 'us', e.g.:
    opts = {}
    if location == "eu":
        opts = {"api_endpoint": "eu-documentai.googleapis.com"}

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # 'mime_type' can be 'application/pdf', 'image/tiff',
    # and 'image/gif', or 'application/json'
    gcs_documents = documentai.GcsDocuments(
        documents=[{
            "gcs_uri": gcs_input_uri,
            "mime_type": "application/pdf"
        }])
    input_config = documentai.BatchDocumentsInputConfig(
        gcs_documents=gcs_documents)

    # Where to write results
    output_config = documentai.DocumentOutputConfig(
        gcs_output_config={"gcs_uri": destination_uri})

    # Location can be 'us' or 'eu'
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    request = documentai.types.document_processor_service.BatchProcessRequest(
        name=name,
        input_documents=input_config,
        document_output_config=output_config,
    )

    operation = client.batch_process_documents(request)

    # Wait for the operation to finish
    operation.result(timeout=timeout)

    # Results are written to GCS; list everything under the output prefix.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)

    print("Output files:")
    for file_number, blob in enumerate(bucket.list_blobs(prefix=prefix), start=1):
        # Only objects whose name ends in .json are parsed as results.
        if not blob.name.endswith(".json"):
            print(f"Skipping non-supported file type {blob.name}")
            continue

        blob_as_bytes = blob.download_as_bytes()
        document = documentai.types.Document.from_json(blob_as_bytes)
        print(f"Fetched file {file_number}")

        # Read the text recognition output from the processor
        for page in document.pages:
            for form_field in page.form_fields:
                field_name = get_text(form_field.field_name, document)
                field_value = get_text(form_field.field_value, document)
                print("Extracted key value pair:")
                print(f"\t{field_name}, {field_value}")
            # Iterate this page's paragraphs (not the document's pages).
            for paragraph in page.paragraphs:
                paragraph_text = get_text(paragraph.layout, document)
                print(f"Paragraph text:\n{paragraph_text}")
def process_document_sample( project_id: str, location: str, processor_id: str, file_path: str, template_path: str ):
    """Process a local PDF with Document AI and score selected fields.

    The file at ``file_path`` is sent inline as ``application/pdf`` to the
    given processor, and the image at ``template_path`` is loaded with
    ``cv2.imread``. For each page and each entry of ``FIELD_NAMES``, the
    text of the listed paragraphs is concatenated, filtered with
    ``remove_from_text``, scored with ``ocr_acc`` against ``ground_truth``
    and printed together with the mean paragraph confidence.

    Returns:
        The tuple ``(document_pages, document, template)``.

    NOTE(review): the original formatting of this function was lost, so the
    nesting of the trailing ``doc2csv`` / ``put_OCR`` / ``print`` statements
    below is a best guess -- confirm against the original file.
    NOTE(review): relies on names not defined in this function
    (``FIELD_NAMES``, ``ground_truth``, ``OCR_LOCATIONS``, ``remove_from_text``,
    ``ocr_acc``, ``doc2csv``, ``put_OCR``, ``layout``); presumably module-level.
    """
    # You must set the api_endpoint if you use a location other than 'us', e.g.:
    opts = {}
    if location == "eu":
        opts = {"api_endpoint": "eu-documentai.googleapis.com"}

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    template = cv2.imread(template_path)

    document = {"content": image_content, "mime_type": "application/pdf"}

    # Configure the process request
    request = {"name": name, "raw_document": document}

    # Recognizes text entities in the PDF document
    result = client.process_document(request=request)

    document = result.document

    print("Document processing complete.")

    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

    document_pages = document.pages

    # Initialising accuracy accumulators
    doc_acc = []
    field_acc = np.zeros( (len(FIELD_NAMES),1) )
    doc_text = []
    confidence = []

    # Read the text recognition output from the processor
    print("The document contains the following paragraphs:")
    for page in document_pages:
        # Disabled debug code that dumped every paragraph on the page:
        # field_text = ''
        # for idx, paragraph in enumerate(page.paragraphs):
        #     field_text = get_text(paragraph.layout, document)
        #     print(f'Paragraph: {idx} \n{field_text} \n\n')

        for idx, field in enumerate(FIELD_NAMES):
            field_text = ''
            text_loc = []  # collected below but never read in this function
            paragraph_confidence = []
            # field[0] is used as the key into ground_truth
            ground_field = ground_truth[field[0]]

            # field.paragraph lists the paragraph indices that make up this field
            for paragraph_num in field.paragraph:
                field_text += get_text(page.paragraphs[paragraph_num].layout, document)
                paragraph_confidence.append(page.paragraphs[paragraph_num].layout.confidence)
                text_loc.append(page.paragraphs[paragraph_num].layout)

            field_text = remove_from_text(field_text, OCR_LOCATIONS[idx].filter_keywords)
            doc_text.append(field_text.replace('\n', ', '))
            # ocr_acc returns the field accuracy, the updated doc_acc and the field text
            field_acc[idx], doc_acc, field_text = ocr_acc(ground_field, field_text.split('\n'), doc_acc)

            # NOTE(review): sum/len raises ZeroDivisionError if field.paragraph is empty.
            print(f'Paragraph {idx}: (Field Accuracy = {field_acc[idx]}) \nParagraph Confidence: {sum(paragraph_confidence)/len(paragraph_confidence)} \n{field_text}\n\n')
            confidence.append(sum(paragraph_confidence)/len(paragraph_confidence))

    doc2csv(doc_text,confidence)
    # NOTE(review): `layout` is not assigned anywhere in this function, and
    # `image` is the file handle already closed by the `with` block above --
    # confirm both are what put_OCR expects.
    put_OCR(field_text, layout, image)

    # idx still holds the index of the last field processed by the loop above.
    print( f'Document Accuracy = {doc_acc[idx]} \n Confidence: {confidence}')

    return document_pages, document, template