def parse_with_model(
        project_id='YOUR_PROJECT_ID',
        input_uri='gs://cloud-samples-data/documentai/invoice.pdf',
        automl_model_name='YOUR_AUTOML_MODEL_NAME'):
    """Run a single document through a custom AutoML model via Document AI.

    Args:
        project_id: your Google Cloud project id.
        input_uri: the Cloud Storage URI of your input PDF.
        automl_model_name: the AutoML model name formatted as:
            `projects/[PROJECT_ID]/locations/[LOCATION]/models/[MODEL_ID]`
            where LOCATION is a Compute Engine region, e.g. `us-central1`.
    """
    client = documentai.DocumentUnderstandingServiceClient()

    # Supported mime types include application/pdf, image/tiff,
    # image/gif and application/json.
    request = documentai.types.ProcessDocumentRequest(
        # Location can be 'us' or 'eu'
        parent='projects/{}/locations/us'.format(project_id),
        input_config=documentai.types.InputConfig(
            gcs_source=documentai.types.GcsSource(uri=input_uri),
            mime_type='application/pdf'),
        automl_params=documentai.types.AutoMlParams(model=automl_model_name))

    document = client.process_document(request=request)

    # Report every label the custom model detected with its confidence.
    for label in document.labels:
        print('Label detected: {}'.format(label.name))
        print('Confidence: {}'.format(label.confidence))
def set_endpoint(project_id='YOUR_PROJECT_ID',
                 input_uri='gs://cloud-samples-data/documentai/invoice.pdf'):
    """Process a single document with the Document AI API, including text
    extraction and entity extraction."""
    # [START documentai_set_endpoint_beta]
    from google.cloud import documentai_v1beta2 as documentai

    # Point the client at the EU regional endpoint.
    client = documentai.DocumentUnderstandingServiceClient(
        client_options={'api_endpoint': 'eu-documentai.googleapis.com'})
    # [END documentai_set_endpoint_beta]

    # Supported mime types include application/pdf, image/tiff,
    # image/gif and application/json.
    input_config = documentai.types.InputConfig(
        gcs_source=documentai.types.GcsSource(uri=input_uri),
        mime_type='application/pdf')

    # Location can be 'us' or 'eu' — it must match the endpoint above.
    request = documentai.types.ProcessDocumentRequest(
        parent='projects/{}/locations/eu'.format(project_id),
        input_config=input_config)

    document = client.process_document(request=request)

    # All text extracted from the document
    print('Document Text: {}'.format(document.text))
def parse_invoice(project_id='temporal-tensor-307222',
                  input_uri='gs://cloud-samples-data/documentai/invoice.pdf'):
    # input_uri='gs://docu_test/AC8BR-05U.PDF'):
    """Process a single document with the Document AI API and print the
    raw response content."""
    client = documentai.DocumentUnderstandingServiceClient()

    # mime_type can be application/pdf, image/tiff,
    # image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=documentai.types.GcsSource(uri=input_uri),
        mime_type='application/pdf')

    # Location can be 'us' or 'eu'
    request = documentai.types.ProcessDocumentRequest(
        parent='projects/{}/locations/us'.format(project_id),
        input_config=input_config)

    document = client.process_document(request=request)

    print(type(document.content))
    # create_json(json.dump(document))
    # Dump the raw document content.
    print('Document Text: {}'.format(document.content))
def get_parsed_document(
    project_id="white-flame-244921",
    input_uri="gs://cloud-samples-data/documentai/invoice.pdf",
    mime_type="image/tiff",
):
    """Process a single document with the Document AI API and return the
    parsed Document proto.

    Args:
        project_id: Google Cloud project id.
        input_uri: Cloud Storage URI of the input file.
        mime_type: mime type of the input; can be application/pdf,
            image/tiff, image/gif, or application/json. The default is
            kept at "image/tiff" for backward compatibility, but
            NOTE(review): the default input_uri points at a PDF, so
            callers using both defaults should pass
            mime_type="application/pdf".

    Returns:
        The Document returned by process_document.
    """
    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source,
        mime_type=mime_type,
    )

    # Location can be 'us' or 'eu'
    parent = "projects/{}/locations/eu".format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent, input_config=input_config
    )

    document = client.process_document(request=request)
    return document
def get_form_fields(bucket, filename):
    """Run form extraction on gs://<bucket>/<filename> and return the
    detected form fields.

    Args:
        bucket: name of the Cloud Storage bucket holding the PDF.
        filename: object name of the PDF within the bucket.

    Returns:
        A list of dicts, one per detected form field, with keys
        "filename", "page", "form_field_name" and "form_field_value".
    """
    client = documentai.DocumentUnderstandingServiceClient()

    # BUG FIX: the source URI must reference the requested object; the
    # previous code interpolated a literal placeholder instead of the
    # `filename` parameter.
    gcs_source = documentai.types.GcsSource(uri=f"gs://{bucket}/{filename}")

    # mime_type can be application/pdf, image/tiff,
    # image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # Improve form parsing results by providing key-value pair hints.
    # For each key hint, key is text that is likely to appear in the
    # document as a form field name (i.e. "DOB").
    # Value types are optional, but can be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    key_value_pair_hints = [
        documentai.types.KeyValuePairHint(
            key='Emergency Contact', value_types=['NAME']),
        documentai.types.KeyValuePairHint(key='Referred By')
    ]

    # Setting enabled=True enables form extraction
    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=key_value_pair_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(os.environ["PROJECT_ID"])
    request = documentai.types.ProcessDocumentRequest(
        parent=parent,
        input_config=input_config,
        form_extraction_params=form_extraction_params)

    document = client.process_document(request=request)

    def _get_text(el):
        """Doc AI identifies form fields by their offsets in document
        text. This function converts offsets to text snippets."""
        response = ''
        # If a text segment spans several lines, it will be stored
        # in different text segments.
        for segment in el.text_anchor.text_segments:
            response += document.text[segment.start_index:segment.end_index]
        return response

    # Return an array of form fields
    return [{
        "filename": filename,
        "page": page.page_number,
        "form_field_name": _get_text(form_field.field_name),
        "form_field_value": _get_text(form_field.field_value)
    } for page in document.pages for form_field in page.form_fields]
def __init__(self, pdf):
    """Parse *pdf* with Document AI and pre-compute table metadata.

    Args:
        pdf: the input document handed to generate_request() (defined
            elsewhere in this class) — presumably a GCS URI or local
            path; TODO confirm against generate_request().
    """
    self.doc_in = pdf
    # Project id is hard-coded for this deployment.
    self.project_id = 'teak-span-275205'
    self.client = documentai.DocumentUnderstandingServiceClient()
    # One synchronous API call; the request is built by the class's own
    # helper (defined elsewhere in this class).
    self.document = self.client.process_document(
        request=self.generate_request())
    # Cache extracted tables and the widest column count (helpers are
    # defined elsewhere in this class).
    self.tables = self.get_tables()
    self.max_cols = self.get_max_cols()
    # Starts empty; presumably populated by later method calls.
    self.table_indices = []
def parse_form(project_id='YOUR_PROJECT_ID',
               input_uri='gs://cloud-samples-data/documentai/form.pdf'):
    """Parse a form and return the bullet entries found under a
    'SKILLS' heading in the extracted text.

    Args:
        project_id: Google Cloud project id.
        input_uri: Cloud Storage URI of the input PDF.

    Returns:
        A list of skill strings with bullet characters stripped; empty
        when no 'SKILLS' heading is present.
    """
    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)
    # mime_type can be application/pdf, image/tiff,
    # image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # Improve form parsing results by providing key-value pair hints.
    # For each key hint, key is text that is likely to appear in the
    # document as a form field name (i.e. "DOB").
    # Value types are optional, but can be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    key_value_pair_hints = [
        documentai.types.KeyValuePairHint(
            key='Emergency Contact', value_types=['NAME']),
        documentai.types.KeyValuePairHint(key='Referred By')
    ]

    # Setting enabled=True enables form extraction
    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=key_value_pair_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent,
        input_config=input_config,
        form_extraction_params=form_extraction_params)

    document = client.process_document(request=request)

    t = document.text.split('\n')
    skills = []
    for i, line in enumerate(t):
        if 'SKILLS' in line:
            # Collect the lines following the heading until we hit an
            # all-uppercase line (the next section heading).
            j = i + 1
            # BUG FIX: guard j against running off the end of the text;
            # the original indexed t[j] unconditionally and could raise
            # IndexError when the skills list ran to the last line.
            while j < len(t):
                y = t[j]
                if not ('●' in y or '•' in y or y.isupper() == False):
                    break
                skills.append(y.replace('●', '').replace('•', ''))
                j += 1
    return skills
def parse_file(input_uri):
    """Parse one document and return each detected entity's mention text.

    NOTE(review): relies on a module-level ``project_id`` being defined
    elsewhere in this file — confirm before reuse.
    """
    client = documentai.DocumentUnderstandingServiceClient()

    input_config = documentai.types.InputConfig(
        gcs_source=documentai.types.GcsSource(uri=input_uri),
        mime_type='application/pdf')
    request = documentai.types.ProcessDocumentRequest(
        parent='projects/{}/locations/us'.format(project_id),
        input_config=input_config)

    document = client.process_document(request=request)
    print(document.entities)
    return [entity.mention_text for entity in document.entities]
def get_document_text(input_uri, project_id):
    """Return all text extracted from one document.

    Args:
        input_uri: Cloud Storage URI of the input PDF.
        project_id: Google Cloud project id.

    Returns:
        The document's full extracted text.
    """
    # (Removed the original's no-op self-assignments of both parameters.)
    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent, input_config=input_config)

    document = client.process_document(request=request)
    return document.text
def main(project_id='document-ai-project-291706',
         input_uri='gs://analysis_report_samples/sample_ocr_1.pdf'):
    # input_uri='gs://cloud-samples-data/documentai/invoice.pdf'):
    """Process a single document with the Document AI API, including
    text extraction and entity extraction; results are echoed to stdout
    and written to 'output.text'.
    """
    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)
    # mime_type can be application/pdf, image/tiff,
    # image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent, input_config=input_config)

    document = client.process_document(request=request)

    def _get_text(el):
        """Convert text offset indexes into text snippets."""
        response = ''
        # If a text segment spans several lines, it will
        # be stored in different text segments.
        for segment in el.text_anchor.text_segments:
            response += document.text[segment.start_index:segment.end_index]
        return response

    # BUG FIX: use a context manager so the output file is closed even
    # if an exception is raised mid-way (original used open()/close()).
    with open('output.text', 'w') as file_:
        # All text extracted from the document
        print('Document Text: {}'.format(document.text))
        file_.write('Document Text: {}'.format(document.text))

        for entity in document.entities:
            # print('Entity type: {}'.format(entity.type))
            print('Text: {}'.format(_get_text(entity)))
            file_.write('Text: {}'.format(_get_text(entity)))
            print('Mention text: {}\n'.format(entity.mention_text))
            file_.write('Mention text: {}\n'.format(entity.mention_text))
def sample_analyze_entities(input_uri):
    """
    Analyzing Entities in text file stored in Cloud Storage

    NOTE(review): relies on module-level ``credentials``, ``project_id``,
    ``language_v1`` and ``enums`` defined elsewhere in this file.
    """
    # Step 1: extract the document's text with Document AI.
    client = documentai.DocumentUnderstandingServiceClient(
        credentials=credentials)
    gcs_source = documentai.types.GcsSource(uri=input_uri)
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent, input_config=input_config)
    document = client.process_document(request=request)

    # Step 2: feed the extracted text to the Natural Language API
    # (both `client` and `document` are deliberately rebound here).
    client = language_v1.LanguageServiceClient(credentials=credentials)
    type_ = enums.Document.Type.PLAIN_TEXT
    document = {"content": document.text, "type": type_}
    encoding_type = enums.EncodingType.UTF8

    response = client.analyze_entities(document, encoding_type=encoding_type)

    # Loop through entitites returned from the API
    for entity in response.entities:
        print(u"Representative name for the entity: {}".format(entity.name))
        # Get entity type, e.g. PERSON, LOCATION, ADDRESS, NUMBER, et al
        print(u"Entity type: {}".format(enums.Entity.Type(entity.type).name))
        # Get the salience score associated with the entity in the [0, 1.0] range
        print(u"Salience score: {}".format(entity.salience))
        # Loop over the metadata associated with entity. For many known entities,
        # the metadata is a Wikipedia URL (wikipedia_url) and Knowledge Graph MID (mid).
        # Some entity types may have additional metadata, e.g. ADDRESS entities
        # may have metadata for the address street_name, postal_code, et al.
        for metadata_name, metadata_value in entity.metadata.items():
            print(u"{}: {}".format(metadata_name, metadata_value))
        # Loop over the mentions of this entity in the input document.
        # The API currently supports proper noun mentions.
        for mention in entity.mentions:
            print(u"Mention text: {}".format(mention.text.content))
            # Get the mention type, e.g. PROPER for proper noun
            print(u"Mention type: {}".format(
                enums.EntityMention.Type(mention.type).name))
    # BUG FIX: removed a stray trailing '"""' that opened an
    # unterminated string literal at the end of this function.
def main(
    project_id="YOUR_PROJECT_ID",
    input_uri="gs://cloud-samples-data/documentai/invoice.pdf",
):
    """Process a single document with the Document AI API, including text
    extraction and entity extraction."""
    client = documentai.DocumentUnderstandingServiceClient()

    # Supported mime types include application/pdf, image/tiff,
    # image/gif and application/json.
    request = documentai.types.ProcessDocumentRequest(
        # Location can be 'us' or 'eu'
        parent="projects/{}/locations/us".format(project_id),
        input_config=documentai.types.InputConfig(
            gcs_source=documentai.types.GcsSource(uri=input_uri),
            mime_type="application/pdf"))

    document = client.process_document(request=request)

    # All text extracted from the document
    print("Document Text: {}".format(document.text))

    def _get_text(el):
        """Convert text offset indexes into text snippets."""
        # A segment spanning several lines is stored as several segments.
        pieces = [
            document.text[segment.start_index:segment.end_index]
            for segment in el.text_anchor.text_segments
        ]
        return "".join(pieces)

    for entity in document.entities:
        print("Entity type: {}".format(entity.type_))
        print("Text: {}".format(_get_text(entity)))
        print("Mention text: {}\n".format(entity.mention_text))
def parse_form(project_id='quantiphi-ttest',
               input_uri='gs://document_ai1/Payslip_11176322 (2).pdf'):
    """Parse a payslip form and print the detected fields plus a JSON
    mapping of field names to values.

    Args:
        project_id: Google Cloud project id.
        input_uri: Cloud Storage URI of the input PDF.
    """
    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)
    # mime_type can be application/pdf, image/tiff,
    # image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # Improve form parsing results by providing key-value pair hints.
    # For each key hint, key is text that is likely to appear in the
    # document as a form field name (i.e. "DOB").
    # Value types are optional, but can be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    key_value_pair_hints = [
        documentai.types.KeyValuePairHint(key='Personnel No', ),
        documentai.types.KeyValuePairHint(key='Name', value_types=['NAME']),
        documentai.types.KeyValuePairHint(key='Bank'),
        documentai.types.KeyValuePairHint(key='Bank A/c No'),
        documentai.types.KeyValuePairHint(key='DOJ'),
        documentai.types.KeyValuePairHint(key='LOP Days'),
        documentai.types.KeyValuePairHint(key='PF No.'),
        documentai.types.KeyValuePairHint(key='Location'),
        documentai.types.KeyValuePairHint(key='Facility'),
        documentai.types.KeyValuePairHint(key='Department'),
        documentai.types.KeyValuePairHint(key='INCOME TAX'),
        documentai.types.KeyValuePairHint(key='PROFESSIONAL TAX'),
        documentai.types.KeyValuePairHint(key='GROSS DEDUCTIONS'),
        documentai.types.KeyValuePairHint(key='PROVIDENT FUND'),
        documentai.types.KeyValuePairHint(key='NGO CONTRIBUTION'),
        documentai.types.KeyValuePairHint(key='PF – UAN'),
    ]

    # Setting enabled=True enables form extraction
    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=key_value_pair_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent,
        input_config=input_config,
        form_extraction_params=form_extraction_params)

    document = client.process_document(request=request)

    def _get_text(el):
        """Doc AI identifies form fields by their offsets in document
        text. This function converts offsets to text snippets."""
        response = ''
        # If a text segment spans several lines, it will
        # be stored in different text segments.
        for segment in el.text_anchor.text_segments:
            response += document.text[segment.start_index:segment.end_index]
        return response

    jsonDict = {}
    for page in document.pages:
        print('Page number: {}'.format(page.page_number))
        for form_field in page.form_fields:
            print('Field Name: {}\tConfidence: {}'.format(
                _get_text(form_field.field_name),
                form_field.field_name.confidence))
            print('Field Value: {}\tConfidence: {}'.format(
                _get_text(form_field.field_value),
                form_field.field_value.confidence))
            # Normalize the en-dash key ('PF – UAN') to plain 'UAN'.
            jsonDict[_get_text(form_field.field_name)
                     .strip()
                     .replace('PF \u2013 UAN', 'UAN')] = \
                _get_text(form_field.field_value).replace('\n', '').strip()

    print(json.dumps(jsonDict))
    # BUG FIX: removed dead trailing code that rebound `client` to a
    # BigQuery client and assigned filename/dataset_id/table_id without
    # ever using them.
def parse_table(project_id, input_uri, filename, cred):
    """Extract tables from one PDF into a DataFrame, pickle it next to
    *filename*, and return it.

    Args:
        project_id: Google Cloud project id.
        input_uri: Cloud Storage URI of the input PDF.
        filename: local file name used to derive the .pkl output path.
        cred: credentials object passed to the Document AI client.

    Returns:
        A DataFrame with one row per table row (header and body), plus
        bookkeeping columns Page / Table / RowNum(_Header).
    """
    RIGHE = pd.DataFrame([])

    client = documentai.DocumentUnderstandingServiceClient(credentials=cred)

    gcs_source = documentai.types.GcsSource(uri=input_uri)
    # mime_type can be application/pdf, image/tiff,
    # image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # Improve table parsing results by providing bounding boxes
    # specifying where the box appears in the document (optional)
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1,
            bounding_box=documentai.types.BoundingPoly(
                # Define a polygon around tables to detect
                # Each vertice coordinate must be a number between 0 and 1
                normalized_vertices=[
                    # Top left
                    documentai.types.geometry.NormalizedVertex(x=0, y=0),
                    # Top right
                    documentai.types.geometry.NormalizedVertex(x=1, y=0),
                    # Bottom right
                    documentai.types.geometry.NormalizedVertex(x=1, y=1),
                    # Bottom left
                    documentai.types.geometry.NormalizedVertex(x=0, y=1)
                ]))
    ]

    # Setting enabled=True enables table extraction
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True, table_bound_hints=table_bound_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent,
        input_config=input_config,
        table_extraction_params=table_extraction_params)

    document = client.process_document(request=request)

    def _get_text(el):
        """Convert text offset indexes into text snippets."""
        response = ''
        # If a text segment spans several lines, it will
        # be stored in different text segments.
        for segment in el.text_anchor.text_segments:
            response += document.text[segment.start_index:segment.end_index]
        return response

    for page in document.pages:
        print('Page number: {}'.format(page.page_number))
        for table_num, table in enumerate(page.tables):
            print('Table {}: '.format(table_num))
            for row_num, row in enumerate(table.header_rows):
                cells = '\t'.join(
                    [_get_text(cell.layout) for cell in row.cells])
                print('Header Row {}: {}'.format(row_num, cells))
                ppp1 = pd.DataFrame(
                    [x.split('\t') for x in cells.split('\n')])
                ppp1['RowNum_Header'] = row_num
                ppp1['Table'] = table_num
                ppp1['Page'] = page.page_number
                # BUG FIX: DataFrame.append was deprecated and removed
                # in pandas 2.0; pd.concat is the supported equivalent.
                RIGHE = pd.concat([RIGHE, ppp1])
            for row_num, row in enumerate(table.body_rows):
                cells = '\t'.join(
                    [_get_text(cell.layout) for cell in row.cells])
                print('Row {}: {}'.format(row_num, cells))
                ppp1 = pd.DataFrame(
                    [x.split('\t') for x in cells.split('\n')])
                ppp1['RowNum'] = row_num
                ppp1['Table'] = table_num
                ppp1['Page'] = page.page_number
                RIGHE = pd.concat([RIGHE, ppp1])

    # Pickle next to the input file, protocol 2 for py2 compatibility.
    FF = os.path.splitext(filename)[0] + '.pkl'
    RIGHE.to_pickle(FF, protocol=2)
    return RIGHE
def batch_parse_table(
        project_id='YOUR_PROJECT_ID',
        input_uri='gs://cloud-samples-data/documentai/form.pdf',
        destination_uri='gs://your-bucket-id/path/to/save/results/'):
    """Batch-parse a form's tables and list the result files in GCS."""
    client = documentai.DocumentUnderstandingServiceClient()

    # Supported mime types include application/pdf, image/tiff,
    # image/gif and application/json.
    input_config = documentai.types.InputConfig(
        gcs_source=documentai.types.GcsSource(uri=input_uri),
        mime_type='application/pdf')

    # Where to write results: one document page per output shard.
    output_config = documentai.types.OutputConfig(
        gcs_destination=documentai.types.GcsDestination(uri=destination_uri),
        pages_per_shard=1)

    # Improve table parsing results by providing bounding boxes
    # specifying where the box appears in the document (optional).
    # Vertex coordinates are normalized to [0, 1]; this polygon covers
    # all of page 1 (top-left, top-right, bottom-right, bottom-left).
    whole_page = documentai.types.BoundingPoly(normalized_vertices=[
        documentai.types.geometry.NormalizedVertex(x=0, y=0),
        documentai.types.geometry.NormalizedVertex(x=1, y=0),
        documentai.types.geometry.NormalizedVertex(x=1, y=1),
        documentai.types.geometry.NormalizedVertex(x=0, y=1)])
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1, bounding_box=whole_page)]

    # Setting enabled=True enables table extraction.
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True, table_bound_hints=table_bound_hints)

    # Location can be 'us' or 'eu'.
    parent = 'projects/{}/locations/us'.format(project_id)
    requests = [documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        table_extraction_params=table_extraction_params)]
    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=requests)

    operation = client.batch_process_documents(batch_request)

    # Wait for the operation to finish.
    operation.result()

    # Results are written to GCS; use a regex to find the output files.
    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
    output_bucket = match.group(1)
    prefix = match.group(2)

    storage_client = storage.client.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)
def parse_table(
    project_id="YOUR_PROJECT_ID",
    input_uri="gs://cloud-samples-data/documentai/invoice.pdf",
):
    """Parse a form"""
    client = documentai.DocumentUnderstandingServiceClient()

    # Supported mime types include application/pdf, image/tiff,
    # image/gif and application/json.
    input_config = documentai.types.InputConfig(
        gcs_source=documentai.types.GcsSource(uri=input_uri),
        mime_type="application/pdf")

    # Improve table parsing results by providing bounding boxes
    # specifying where the box appears in the document (optional).
    # Vertex coordinates are normalized to [0, 1]; this polygon covers
    # all of page 1.
    page_poly = documentai.types.BoundingPoly(normalized_vertices=[
        documentai.types.geometry.NormalizedVertex(x=0, y=0),  # top left
        documentai.types.geometry.NormalizedVertex(x=1, y=0),  # top right
        documentai.types.geometry.NormalizedVertex(x=1, y=1),  # bottom right
        documentai.types.geometry.NormalizedVertex(x=0, y=1),  # bottom left
    ])
    # Setting enabled=True enables table extraction.
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True,
        table_bound_hints=[documentai.types.TableBoundHint(
            page_number=1, bounding_box=page_poly)])

    # Location can be 'us' or 'eu'.
    request = documentai.types.ProcessDocumentRequest(
        parent="projects/{}/locations/us".format(project_id),
        input_config=input_config,
        table_extraction_params=table_extraction_params,
    )

    document = client.process_document(request=request)

    def _get_text(el):
        """Convert text offset indexes into text snippets."""
        # A segment spanning several lines arrives as several segments.
        return "".join(
            document.text[seg.start_index:seg.end_index]
            for seg in el.text_anchor.text_segments)

    for page in document.pages:
        print("Page number: {}".format(page.page_number))
        for table_num, table in enumerate(page.tables):
            print("Table {}: ".format(table_num))
            for row_num, row in enumerate(table.header_rows):
                cells = "\t".join(
                    [_get_text(cell.layout) for cell in row.cells])
                print("Header Row {}: {}".format(row_num, cells))
            for row_num, row in enumerate(table.body_rows):
                cells = "\t".join(
                    [_get_text(cell.layout) for cell in row.cells])
                print("Row {}: {}".format(row_num, cells))
def parse_form(
    project_id="YOUR_PROJECT_ID",
    input_uri="gs://cloud-samples-data/documentai/form.pdf",
):
    """Parse a form"""
    client = documentai.DocumentUnderstandingServiceClient()

    # Supported mime types include application/pdf, image/tiff,
    # image/gif and application/json.
    input_config = documentai.types.InputConfig(
        gcs_source=documentai.types.GcsSource(uri=input_uri),
        mime_type="application/pdf")

    # Improve form parsing results by providing key-value pair hints.
    # Each key is text likely to appear in the document as a form field
    # name (i.e. "DOB"). Value types are optional, but can be one or
    # more of: ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER,
    # ID, NUMBER, EMAIL, PRICE, TERMS, DATE, NAME.
    hints = [
        documentai.types.KeyValuePairHint(key="Emergency Contact",
                                          value_types=["NAME"]),
        documentai.types.KeyValuePairHint(key="Referred By"),
    ]

    # Setting enabled=True enables form extraction.
    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=hints)

    # Location can be 'us' or 'eu'.
    request = documentai.types.ProcessDocumentRequest(
        parent="projects/{}/locations/us".format(project_id),
        input_config=input_config,
        form_extraction_params=form_extraction_params,
    )

    document = client.process_document(request=request)

    def _get_text(el):
        """Doc AI identifies form fields by their offsets in document text.
        This function converts offsets to text snippets.
        """
        # A segment spanning several lines arrives as several segments.
        return "".join(
            document.text[seg.start_index:seg.end_index]
            for seg in el.text_anchor.text_segments)

    for page in document.pages:
        print("Page number: {}".format(page.page_number))
        for form_field in page.form_fields:
            print("Field Name: {}\tConfidence: {}".format(
                _get_text(form_field.field_name),
                form_field.field_name.confidence))
            print("Field Value: {}\tConfidence: {}".format(
                _get_text(form_field.field_value),
                form_field.field_value.confidence))
def parse_table(project_id='ons-companies-house-dev',
                input_uri='gs://ons-companies-house-dev-scraped-pdf-data/doc_ai_outputs/bs_pdfs/04391694_active_bs.pdf',
                print_stuff = False):
    """Parse a form"""
    # NOTE(review): print_stuff is accepted but never read in this
    # function — confirm whether callers expect verbose output here.
    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)
    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # Improve table parsing results by providing bounding boxes
    # specifying where the box appears in the document (optional)
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1,
            bounding_box=documentai.types.BoundingPoly(
                # Define a polygon around tables to detect
                # Each vertice coordinate must be a number between 0 and 1
                normalized_vertices=[
                    # Top left
                    documentai.types.geometry.NormalizedVertex(x=0, y=0),
                    # Top right
                    documentai.types.geometry.NormalizedVertex(x=1, y=0),
                    # Bottom right
                    documentai.types.geometry.NormalizedVertex(x=1, y=1),
                    # Bottom left
                    documentai.types.geometry.NormalizedVertex(x=0, y=1)
                ]
            )
        )
    ]

    # Setting enabled=True enables form extraction
    # header_hints supplies literal header strings expected in these
    # balance-sheet tables; model_version pins the parser build.
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True,
        table_bound_hints=table_bound_hints,
        model_version = "builtin/latest",
        header_hints = ["At 31 December\n", "2019\n", "$ million2018\n"])

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent,
        input_config=input_config,
        table_extraction_params=table_extraction_params)

    document = client.process_document(request=request)
    return(document)
def parse_table(filename, condense=False):
    """Upload *filename* to GCS, extract its tables with Document AI,
    and store a pipe-delimited text dump of the tables back in GCS.

    Relies on module-level ``bucket``, ``BUCKET_URL``, ``PROJECT_ID``
    and ``summarize`` defined elsewhere in this file.

    Args:
        filename: local path of the PDF to parse.
        condense: when True, run the dump through summarize().

    Returns:
        (out_uri, data): the random GCS object name the dump was
        uploaded under, and the dump text itself.
    """
    # Upload the input under a random object name.
    input_uri = secrets.token_hex(nbytes=16)
    blob = bucket.blob(input_uri)
    blob.upload_from_filename(filename)

    client = documentai.DocumentUnderstandingServiceClient()
    gcs_source = documentai.types.GcsSource(uri=BUCKET_URL + input_uri)
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type="application/pdf")

    # Hint that a table may appear anywhere on page 1
    # (vertex coordinates normalized to [0, 1]).
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1,
            bounding_box=documentai.types.BoundingPoly(normalized_vertices=[
                documentai.types.geometry.NormalizedVertex(x=0, y=0),
                documentai.types.geometry.NormalizedVertex(x=1, y=0),
                documentai.types.geometry.NormalizedVertex(x=1, y=1),
                documentai.types.geometry.NormalizedVertex(x=0, y=1),
            ]),
        )
    ]

    # Setting enabled=True enables table extraction
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True, table_bound_hints=table_bound_hints)

    parent = "projects/{}/locations/us".format(PROJECT_ID)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent,
        input_config=input_config,
        table_extraction_params=table_extraction_params,
    )

    document = client.process_document(request=request)

    def _get_text(el):
        """Convert text-anchor offsets into a whitespace-normalized
        snippet."""
        response = ""
        for segment in el.text_anchor.text_segments:
            response += document.text[segment.start_index:segment.end_index]
        # Map newlines, tabs, bullets and colons to spaces.
        for char in ["\n", "\t", "•", ":"]:
            response = response.replace(char, " ")
        # BUG FIX: collapse runs of spaces into one. The original
        # tested the loop variable (`while " " in char`) and replaced a
        # single space with a single space — both no-ops.
        while "  " in response:
            response = response.replace("  ", " ")
        return response.strip()

    # One pipe-delimited line per table row, blank line between pages.
    out = []
    for page in document.pages:
        for table_num, table in enumerate(page.tables):
            for row_num, row in enumerate(table.header_rows):
                out.append("|".join(
                    [_get_text(cell.layout) for cell in row.cells]))
            for row_num, row in enumerate(table.body_rows):
                out.append("|".join(
                    [_get_text(cell.layout) for cell in row.cells]))
        out.append("")

    data = "\n".join(out)[:6000]
    # Fall back to the raw text when table extraction found little.
    if len(data) < 1000:
        data = document.text
    if condense:
        data = summarize(data)

    # Upload the dump under a fresh random object name.
    out_uri = secrets.token_hex(nbytes=16)
    blob = bucket.blob(out_uri)
    blob.upload_from_string(data)
    return out_uri, data
def parse_invoice(
        project_id='temporal-tensor-307222',
        # input_uri='gs://cloud-samples-data/documentai/invoice.pdf'):
        input_uri='gs://docu_test/AC8BR-05U-2.pdf'):
    """Process a single document with the Document AI API, dump its repr and
    paragraph text to local files, upload the paragraphs as JSON to GCS, and
    load them into a BigQuery table.

    Args:
        project_id: Google Cloud project id used to build the request parent.
        input_uri: Cloud Storage URI of the input PDF.
    """
    destination_uri = 'gs://docu_test/'
    client = documentai.DocumentUnderstandingServiceClient()
    gcs_source = documentai.types.GcsSource(uri=input_uri)
    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(gcs_source=gcs_source,
                                                mime_type='application/pdf')
    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent, input_config=input_config)
    document = client.process_document(request=request)
    # create_json(document)
    # All text extracted from the document
    # print('Document Text: {}'.format(document))
    # Dump the whole Document proto repr to a local file for debugging.
    f = open('file.txt', 'w')
    f.write('dict = ' + repr(document) + '\n')
    f.close()
    document_pages = document.pages
    # print(document_pages)
    rows_to_insert = []
    # Read the text recognition output from the processor
    print("The document contains the following paragraphs:")
    for page in document_pages:
        paragraphs = page.paragraphs
        for paragraph in paragraphs:
            # get_text resolves the layout's text anchor against the full
            # document text (defined elsewhere in this file).
            paragraph_text = get_text(paragraph.layout, document)
            # print(f"Paragraph text: {paragraph_text}")
            print(paragraph_text)
            # Each paragraph becomes one JSON string (one row per paragraph).
            y = json.dumps(paragraph_text)
            # the result is a JSON string:
            print(y)
            rows_to_insert.append(y)
    print(rows_to_insert)
    # Write the rows locally, then push them to GCS as data.json so BigQuery
    # can load them from a gs:// URI below.
    with open('data.txt', 'w') as outfile:
        json.dump(rows_to_insert, outfile)
    upload_blob("docu_test", "data.txt", "data.json")
    from google.cloud import bigquery
    # # Construct a BigQuery client object.
    client = bigquery.Client()
    # # TODO(developer): Set table_id to the ID of the table to create.
    table_id = "temporal-tensor-307222.docu_test.printer_tab"
    # # Set the encryption key to use for the destination.
    # # TODO: Replace this key with a key you have created in KMS.
    # # kms_key_name = "projects/{}/locations/{}/keyRings/{}/cryptoKeys/{}".format(
    # #     "cloud-samples-tests", "us", "test", "test"
    # # )
    # NOTE(review): the uploaded file is a single JSON array, but the load
    # job expects newline-delimited JSON — confirm the format matches.
    job_config = bigquery.LoadJobConfig(
        autodetect=True,
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON)
    uri = "gs://docu_test/data.json"
    load_job = client.load_table_from_uri(
        uri, table_id, job_config=job_config)  # Make an API request.
    load_job.result()  # Waits for the job to complete.
    destination_table = client.get_table(table_id)
    print("Loaded {} rows.".format(destination_table.num_rows))
    # document1 =documentai.types.Document.from_json(document)
    # NOTE(review): deliberately disabled branch (1 == 2 is always False);
    # kept as-is — it sketches batch-output handling that was never wired up.
    if (1 == 2):
        # Results are written to GCS. Use a regex to find
        # output files
        match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
        output_bucket = match.group(1)
        prefix = match.group(2)
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(output_bucket)
        blob_list = list(bucket.list_blobs(prefix=prefix))
        print("Output files:")
        for i, blob in enumerate(blob_list):
            # If JSON file, download the contents of this blob as a bytes object.
            if ".json" in blob.name:
                blob_as_bytes = blob.download_as_bytes()
                document = documentai.types.Document.from_json(blob_as_bytes)
                print(f"Fetched file {i + 1}")
                # For a full list of Document object attributes, please reference this page:
                # https://cloud.google.com/document-ai/docs/reference/rpc/google.cloud.documentai.v1beta3#document
                # Read the text recognition output from the processor
                for page in document.pages:
                    for form_field in page.form_fields:
                        field_name = get_text(form_field.field_name, document)
                        field_value = get_text(form_field.field_value, document)
                        print("Extracted key value pair:")
                        print(f"\t{field_name}, {field_value}")
                for paragraph in document.pages:
                    paragraph_text = get_text(paragraph.layout, document)
                    # print(f"Paragraph text:\n{paragraph_text}")
            else:
                print(f"Skipping non-supported file type {blob.name}")
def parse_table(project_id, input_uri):
    """Parse tables from a PDF in GCS and append the rows to a local file.

    Args:
        project_id: Google Cloud project id.
        input_uri: Cloud Storage URI of the input PDF (must contain a
            filename with an extension; the stem names the output file).

    Side effects:
        Appends 'Row:<tab-joined cells>' entries to
        /home/srinidhi/angular/upload/<pdf_name>.txt.
    """
    client = documentai.DocumentUnderstandingServiceClient()
    gcs_source = documentai.types.GcsSource(uri=input_uri)
    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(gcs_source=gcs_source,
                                                mime_type='application/pdf')

    # Improve table parsing results by providing bounding boxes
    # specifying where the box appears in the document (optional)
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1,
            bounding_box=documentai.types.BoundingPoly(
                # Define a polygon around tables to detect
                # Each vertex coordinate must be a number between 0 and 1
                normalized_vertices=[
                    # Top left
                    documentai.types.geometry.NormalizedVertex(x=0, y=0),
                    # Top right
                    documentai.types.geometry.NormalizedVertex(x=1, y=0),
                    # Bottom right
                    documentai.types.geometry.NormalizedVertex(x=1, y=1),
                    # Bottom left
                    documentai.types.geometry.NormalizedVertex(x=0, y=1)
                ]))
    ]

    # Setting enabled=True enables table extraction
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True, table_bound_hints=table_bound_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent, input_config=input_config,
        table_extraction_params=table_extraction_params)

    document = client.process_document(request=request)

    def _get_text(el):
        """Convert text offset indexes into text snippets."""
        response = ''
        # If a text segment spans several lines, it will
        # be stored in different text segments.
        for segment in el.text_anchor.text_segments:
            response += document.text[segment.start_index:segment.end_index]
        return response

    # Raw string fixes the invalid-escape warning the original plain
    # "\w" pattern produced.
    pdf_filename = re.findall(r"[\w.-]+?(?=\.)", input_uri)
    pdf_filename = pdf_filename[0]
    print("pdf_filename", pdf_filename)
    file_name_txt = '/home/srinidhi/angular/upload/' + pdf_filename + '.txt'

    # Open once with a context manager instead of re-opening a handle for
    # every page (the original leaked the handle on any exception).
    with open(file_name_txt, "a") as out_file:
        for page in document.pages:
            # out_file.write('Page number: {}'.format(page.page_number))
            for table_num, table in enumerate(page.tables):
                # out_file.write('Table {}: '.format(table_num))
                for row_num, row in enumerate(table.header_rows):
                    cells = '\t'.join(
                        [_get_text(cell.layout) for cell in row.cells])
                    out_file.write('Row:{}'.format(cells))
                for row_num, row in enumerate(table.body_rows):
                    cells = '\t'.join(
                        [_get_text(cell.layout) for cell in row.cells])
                    out_file.write('Row:{}'.format(cells))
def parse_form(input_uri, project_id='YOUR_PROJECT_ID'):
    """Parse a form with the Document AI form parser.

    BUG FIX: the original body referenced ``project_id`` although the
    signature only accepted ``input_uri`` — a NameError unless a module-level
    global happened to exist.  It is now an explicit keyword parameter, so
    existing single-argument callers keep working.

    Args:
        input_uri: Cloud Storage URI of the input PDF.
        project_id: Google Cloud project id used to build the request parent.

    Returns:
        dict mapping each detected form-field name to its value (both
        right-stripped of trailing whitespace).
    """
    client = documentai.DocumentUnderstandingServiceClient()
    gcs_source = documentai.types.GcsSource(uri=input_uri)
    input_config = documentai.types.InputConfig(gcs_source=gcs_source,
                                                mime_type='application/pdf')

    # Improve form parsing results by providing key-value pair hints.
    # For each key hint, key is text that is likely to appear in the
    # document as a form field name (i.e. "DOB").
    # Value types are optional, but can be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    key_value_pair_hints = [
        documentai.types.KeyValuePairHint(key='Emergency Contact',
                                          value_types=['NAME']),
        documentai.types.KeyValuePairHint(key='Referred By')
    ]

    # Setting enabled=True enables form extraction
    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=key_value_pair_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent, input_config=input_config,
        form_extraction_params=form_extraction_params)

    document = client.process_document(request=request)

    def _get_text(el):
        """Doc AI identifies form fields by their offsets in document text.
        This function converts offsets to text snippets.
        """
        response = ''
        # If a text segment spans several lines, it will
        # be stored in different text segments.
        for segment in el.text_anchor.text_segments:
            response += document.text[segment.start_index:segment.end_index]
        return response

    payload = dict()
    for page in document.pages:
        for form_field in page.form_fields:
            name = _get_text(form_field.field_name).rstrip()
            value = _get_text(form_field.field_value).rstrip()
            payload[name] = value
    return payload
def batch_parse_form(
        project_id='YOUR_PROJECT_ID',
        input_uri='gs://cloud-samples-data/documentai/form.pdf',
        destination_uri='gs://your-bucket-id/path/to/save/results/'):
    """Parse a form with the asynchronous batch API and list the output files.

    Args:
        project_id: Google Cloud project id.
        input_uri: Cloud Storage URI of the input PDF.
        destination_uri: Cloud Storage prefix where result shards are written.
    """
    client = documentai.DocumentUnderstandingServiceClient()
    gcs_source = documentai.types.GcsSource(uri=input_uri)
    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(gcs_source=gcs_source,
                                                mime_type='application/pdf')

    # where to write results
    output_config = documentai.types.OutputConfig(
        gcs_destination=documentai.types.GcsDestination(uri=destination_uri),
        pages_per_shard=1  # Map one doc page to one output page
    )

    # Improve form parsing results by providing key-value pair hints.
    # For each key hint, key is text that is likely to appear in the
    # document as a form field name (i.e. "DOB").
    # Value types are optional, but can be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    key_value_pair_hints = [
        documentai.types.KeyValuePairHint(key='Emergency Contact',
                                          value_types=['NAME']),
        documentai.types.KeyValuePairHint(key='Referred By')
    ]

    # Setting enabled=True enables form extraction
    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=key_value_pair_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    # NOTE: in the batch API the parent goes on the BatchProcessDocumentsRequest,
    # not on the individual ProcessDocumentRequest.
    request = documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        form_extraction_params=form_extraction_params)

    # Add each ProcessDocumentRequest to the batch request
    requests = []
    requests.append(request)

    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=requests)

    operation = client.batch_process_documents(batch_request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS. Use a regex to find
    # output files
    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
    output_bucket = match.group(1)
    prefix = match.group(2)

    # FIX: use the public constructor storage.Client() instead of the
    # private-module path storage.client.Client(), consistent with the
    # other samples in this file.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)
def main(project_id='document-ai-project-291706',
         input_uri='gs://analysis_report_samples/sample_ocr_7.pdf'):
    # input_uri='gs://cloud-samples-data/documentai/invoice.pdf'):
    """Process a single document with the Document AI API — text, entity and
    table extraction — printing the results and mirroring them into the
    local report file data_report_6.text.

    Args:
        project_id: Google Cloud project id.
        input_uri: Cloud Storage URI of the input PDF.
    """
    client = documentai.DocumentUnderstandingServiceClient()
    gcs_source = documentai.types.GcsSource(uri=input_uri)
    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # insert table extraction code here
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1,
            bounding_box=documentai.types.BoundingPoly(
                # Define a polygon around tables to detect
                # Each vertex coordinate must be a number between 0 and 1
                normalized_vertices=[
                    # Top left
                    documentai.types.geometry.NormalizedVertex(x=0, y=0),
                    # Top right
                    documentai.types.geometry.NormalizedVertex(x=1, y=0),
                    # Bottom right
                    documentai.types.geometry.NormalizedVertex(x=1, y=1),
                    # Bottom left
                    documentai.types.geometry.NormalizedVertex(x=0, y=1)
                ]))
    ]

    # Setting enabled=True enables table extraction
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True, table_bound_hints=table_bound_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent, input_config=input_config,
        table_extraction_params=table_extraction_params)

    document = client.process_document(request=request)

    def _get_text(el):
        """Convert text offset indexes into text snippets."""
        response = ''
        # If a text segment spans several lines, it will
        # be stored in different text segments.
        for segment in el.text_anchor.text_segments:
            response += document.text[segment.start_index:segment.end_index]
        return response

    # FIX: context manager guarantees the report file is closed even if an
    # exception is raised mid-way (the original bare open/close leaked the
    # handle on error).
    with open('data_report_6.text', 'w') as file_:
        file_.write('*BEGIN_TEXT_EXTRACTION \n\n')
        # All text extracted from the document
        print('Document Text: {}'.format(document.text))
        file_.write('Document Text: {}'.format(document.text))

        for entity in document.entities:
            # print('Entity type: {}'.format(entity.type))
            print('Text: {}'.format(_get_text(entity)))
            file_.write('Text: {}'.format(_get_text(entity)))
            print('Mention text: {}\n'.format(entity.mention_text))
            file_.write('Mention text: {}\n'.format(entity.mention_text))

        file_.write('\n *BEGIN_TABLE_EXTRACTION \n\n')
        for page in document.pages:
            print('Page number: {}'.format(page.page_number))
            file_.write('Page number: {}'.format(page.page_number))
            for table_num, table in enumerate(page.tables):
                print('Table {}: '.format(table_num))
                file_.write('Table {}: '.format(table_num))
                for row_num, row in enumerate(table.header_rows):
                    cells = '\t'.join(
                        [_get_text(cell.layout) for cell in row.cells])
                    print('Header Row {}: {}'.format(row_num, cells))
                    file_.write('Header Row {}: {}'.format(row_num, cells))
                for row_num, row in enumerate(table.body_rows):
                    cells = '\t'.join(
                        [_get_text(cell.layout) for cell in row.cells])
                    print('Row {}: {}'.format(row_num, cells))
                    file_.write('Row {}: {}'.format(row_num, cells))
        file_.write('\n\n END_OCR')