def get_tables_string(textract_json_string: str,
                      table_format: Pretty_Print_Table_Format = Pretty_Print_Table_Format.github,
                      with_confidence: bool = False,
                      with_geo: bool = False) -> str:
    """
    textract_json_string: Textract response JSON string, parsed into trp.Document (https://github.com/aws-samples/amazon-textract-response-parser/tree/master/src-python)
    table_format: uses tabulate to pretty print the tables to ascii. See https://pypi.org/project/tabulate/ for a list of table format values
    with_confidence: output confidence scores as well
    with_geo: output geo information as well
    """
    logger.debug(f"table_format: {table_format}")
    doc = trp.Document(json.loads(textract_json_string))
    result_value = ""
    if table_format != Pretty_Print_Table_Format.csv:
        for page in doc.pages:
            for table in page.tables:
                table_list = convert_table_to_list(table, with_confidence=with_confidence, with_geo=with_geo)
                result_value += tabulate(table_list, tablefmt=table_format.name) + "\n\n"
    if table_format == Pretty_Print_Table_Format.csv:
        logger.debug("pretty print - csv")
        csv_output = StringIO()
        csv_writer = csv.writer(csv_output, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for page in doc.pages:
            for table in page.tables:
                table_list = convert_table_to_list(table, with_confidence=with_confidence, with_geo=with_geo)
                csv_writer.writerows(table_list)
                csv_writer.writerow([])
        result_value = csv_output.getvalue()
    return result_value
def get_forms_string(textract_json_string: str,
                     table_format: Pretty_Print_Table_Format = Pretty_Print_Table_Format.github,
                     with_confidence: bool = False,
                     with_geo: bool = False) -> str:
    """
    returns string with key-values printed out in format: key: value
    """
    logger.debug(f"table_format: {table_format}")
    doc = trp.Document(json.loads(textract_json_string))
    result_value = ""
    if table_format != Pretty_Print_Table_Format.csv:
        for page in doc.pages:
            forms_list = convert_form_to_list(page.form, with_confidence=with_confidence, with_geo=with_geo)
            result_value += tabulate(forms_list, tablefmt=table_format.name) + "\n\n"
    if table_format == Pretty_Print_Table_Format.csv:
        logger.debug("pretty print - csv")
        csv_output = StringIO()
        csv_writer = csv.writer(csv_output, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for page in doc.pages:
            forms_list = convert_form_to_list(page.form, with_confidence=with_confidence, with_geo=with_geo)
            csv_writer.writerows(forms_list)
            csv_writer.writerow([])
        result_value = csv_output.getvalue()
    return result_value
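# Minimal usage sketch for get_tables_string / get_forms_string above. The file name
# "analyze_doc_response.json" is a placeholder for a stored Textract AnalyzeDocument response;
# everything else uses only the signatures defined in this module.
def demo_pretty_print(response_path: str = "analyze_doc_response.json") -> None:
    with open(response_path) as f:
        textract_json_string = f.read()
    # render every table as a GitHub-flavored ascii table
    print(get_tables_string(textract_json_string, table_format=Pretty_Print_Table_Format.github))
    # render key/value pairs as CSV, including confidence scores
    print(get_forms_string(textract_json_string, table_format=Pretty_Print_Table_Format.csv, with_confidence=True))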
def extractTextract(bucket, textractObjectName):
    response = textract.start_document_analysis(
        DocumentLocation={'S3Object': {
            'Bucket': bucket,
            'Name': textractObjectName
        }},
        FeatureTypes=['TABLES'])
    textractJobId = response["JobId"]
    print('job id is: ', textractJobId)
    time.sleep(15)
    response = textract.get_document_analysis(JobId=textractJobId)
    status = response["JobStatus"]
    # poll until the asynchronous job leaves the IN_PROGRESS state
    while status == "IN_PROGRESS":
        time.sleep(5)
        response = textract.get_document_analysis(JobId=textractJobId)
        status = response["JobStatus"]
        print("Textract Job status: {}".format(status))
    pages = extract_text(textractJobId, response)
    doc = trp.Document(pages)
    return doc
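# Hedged usage sketch for extractTextract above. The bucket and key are placeholders, and the
# module-level `textract` boto3 client is an assumption implied by the function body; this only
# illustrates the call pattern for the asynchronous TABLES analysis.
import boto3

textract = boto3.client(service_name='textract', region_name='us-east-1')

def demo_extract_tables(bucket: str = "my-example-bucket", key: str = "documents/sample.pdf"):
    doc = extractTextract(bucket, key)
    for page in doc.pages:
        print("tables on page:", len(page.tables))
    return doc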
def test_tblock_order_block_by_geo_multi_page():
    p = os.path.dirname(os.path.realpath(__file__))
    f = open(os.path.join(p, "data/gib_multi_page_tables.json"))
    j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = order_blocks_by_geo(t_document)
    doc = t1.Document(t2.TDocumentSchema().dump(t_document))
    assert "Page 1 - Value 1.1.1" == doc.pages[0].tables[0].rows[0].cells[0].text.strip()
    assert "Page 1 - Value 2.1.1" == doc.pages[0].tables[1].rows[0].cells[0].text.strip()
def test_next_token_response():
    p = os.path.dirname(os.path.realpath(__file__))
    f = open(os.path.join(p, "data/gib.json"))
    j = json.load(f)
    assert j['NextToken']
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_page_orientation(t_document)
    assert t_document.pages[0].custom
    doc = t1.Document(t2.TDocumentSchema().dump(t_document))
    for page in doc.pages:
        print(page.custom['Orientation'])
def ExecuteTableValidations(t_doc: t2.TDocument, header_footer_type: HeaderFooterType,
                            accuracy_percentage: float):
    """
    Invoke validations between the last table of each page and the first table of the following page
    """
    page_compare_proc = 0
    table_ids_to_merge = {}
    table_ids_merge_list = []
    from trp.t_pipeline import order_blocks_by_geo
    ordered_doc = order_blocks_by_geo(t_doc)
    trp_doc = trp.Document(TDocumentSchema().dump(ordered_doc))
    for current_page in trp_doc.pages:
        if page_compare_proc >= len(trp_doc.pages) - 1:
            break
        next_page = trp_doc.pages[page_compare_proc + 1]
        # skip page pairs where either page has no tables to compare
        if len(current_page.tables) == 0 or len(next_page.tables) == 0:
            page_compare_proc += 1
            continue
        current_page_table = current_page.tables[len(current_page.tables) - 1]
        next_page_table = next_page.tables[0]
        result_1 = __validate_objects_between_tables(current_page, current_page_table, next_page,
                                                     next_page_table, header_footer_type)
        if result_1:
            result_2_1 = __compare_table_column_numbers(current_page_table, next_page_table)
            result_2_2 = __compare_table_headers(current_page_table, next_page_table)
            if result_2_1 or result_2_2:
                result3 = __compare_table_dimensions(current_page_table, next_page_table,
                                                     accuracy_percentage)
                if result3:
                    table_ids_to_merge[next_page_table.id] = current_page_table.id
                    # extend an existing merge chain if the current table is already its tail,
                    # otherwise start a new chain with this pair of tables
                    if table_ids_merge_list and any(merge_pairs[-1] == current_page_table.id
                                                    for merge_pairs in table_ids_merge_list):
                        table_ids_merge_list[len(table_ids_merge_list) - 1].append(next_page_table.id)
                    else:
                        table_ids_merge_list.append([current_page_table.id, next_page_table.id])
        page_compare_proc += 1
    return table_ids_merge_list
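# Sketch of driving ExecuteTableValidations from a stored multi-page Textract response.
# The file name, the HeaderFooterType.NONE value and the 75.0 accuracy threshold are assumptions
# used only to illustrate the call; pick the header/footer handling your documents need.
import json

def demo_find_merge_candidates(response_path: str = "multi_page_tables.json"):
    with open(response_path) as f:
        t_doc = TDocumentSchema().load(json.load(f))
    merge_list = ExecuteTableValidations(t_doc, HeaderFooterType.NONE, accuracy_percentage=75.0)
    # each entry is a list of table ids spanning consecutive pages that qualify for merging
    for table_ids in merge_list:
        print(table_ids)
    return merge_list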
def get_lines_string(textract_json_string: str, with_page_number: bool = False) -> str:
    """
    returns string with lines separated by \n
    """
    doc = trp.Document(json.loads(textract_json_string))
    i = 0
    result_value = ""
    for page in doc.pages:
        if with_page_number:
            result_value += f"--------- page number: {i} - page ID: {page.id} --------------\n"
        for line in page.lines:
            result_value += f"{line.text}\n"
        i += 1
    return result_value
def test_kv_ocr_confidence(caplog):
    caplog.set_level(logging.DEBUG)
    p = os.path.dirname(os.path.realpath(__file__))
    f = open(os.path.join(p, "data/employment-application.json"))
    j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_kv_ocr_confidence(t_document)
    doc = t1.Document(t2.TDocumentSchema().dump(t_document))
    for page in doc.pages:
        k1 = page.form.getFieldByKey("Home Address:")
        assert k1.key.custom['OCRConfidence'] == {'mean': 99.60698318481445}
        assert k1.value.custom['OCRConfidence'] == {'mean': 99.8596928914388}
        k1 = page.form.getFieldByKey("Phone Number:")
        assert k1.key.custom['OCRConfidence'] == {'mean': 99.55334854125977}
        assert k1.value.custom['OCRConfidence'] == {'mean': 99.23233032226562}
def get_words_string(textract_json: dict, with_page_number: bool = False) -> str:
    """
    returns string with words separated by \n
    """
    doc = trp.Document(textract_json)
    i = 0
    result_value = ""
    for page in doc.pages:
        if with_page_number:
            result_value += f"--------- page number: {i} - page ID: {page.id} --------------\n"
        for line in page.lines:
            for word in line.words:
                result_value += f"{word.text}\n"
        i += 1
    return result_value
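# Small illustrative driver for get_lines_string and get_words_string. The response file name is
# a placeholder; note that get_lines_string takes the raw JSON string while get_words_string
# takes the already-parsed dict, which is why both forms appear below.
import json

def demo_text_output(response_path: str = "analyze_doc_response.json") -> None:
    with open(response_path) as f:
        textract_json_string = f.read()
    print(get_lines_string(textract_json_string, with_page_number=True))
    print(get_words_string(json.loads(textract_json_string), with_page_number=True))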
def test_adjust_bounding_boxes_and_polygons_to_orientation():
    # p = os.path.dirname(os.path.realpath(__file__))
    # f = open(os.path.join(p, "data/gib.json"))
    # j = json.load(f)
    # t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    # t_document = add_page_orientation(t_document)
    # doc = t1.Document(t2.TDocumentSchema().dump(t_document))
    # key = "Date:"
    # fields = doc.pages[0].form.searchFieldsByKey(key)
    # for field in fields:
    #     print(f"Field: Key: {field.key}, Value: {field.value}, Geo: {field.geometry} ")
    p = os.path.dirname(os.path.realpath(__file__))
    f = open(os.path.join(p, "data/gib__180_degrees.json"))
    j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_page_orientation(t_document)
    new_order = order_blocks_by_geo(t_document)
    doc = t1.Document(t2.TDocumentSchema().dump(t_document))
def lambda_handler(event, context):
    # Amazon Textract
    textract = boto3.client(service_name='textract', region_name='us-east-1')
    # Amazon S3
    s3 = boto3.client('s3')
    try:
        obj = event["Records"][0]["s3"]
        bucket = str(obj["bucket"]["name"])
        file_name = str(obj["object"]["key"])
        file_name_final = file_name.split(".")
        # AWS Textract processing begins here
        response = textract.analyze_document(
            Document={'S3Object': {
                'Bucket': bucket,
                'Name': file_name
            }},
            FeatureTypes=['TABLES', 'FORMS'])
        # calling textract parser module
        doc = trp.Document(response)
        line_content = []
        content_table = []
        content_form = []
        # looping through the parsed response
        for page in doc.pages:
            for line in page.lines:
                line_content.append(line.text)
            # forms
            forms = hp.outputForm(page)
            for items in forms:
                for item in items:
                    content_form.append(item)
            # tables
            content_table = hp.outputTable(page)
        # mark lines that also appear in table cells so they are not duplicated
        for line in line_content:
            for item in content_table:
                if line in item:
                    line_index = line_content.index(line)
                    line_content[line_index] = "table"
                    break
        # final removal of duplicates: drop the "table" placeholders
        final_line_list = hp.Remove(line_content)
        final_line_list = [item for item in final_line_list if item != "table"]
        # copying the list elements into text
        content = ""
        for item in final_line_list:
            content += item + ' '
        for items in content_table:
            content += '\n'
            for item in items:
                content += item + '\t'
        # uploading the file into the bucket
        s3.put_object(Bucket=bucket,
                      Key="text_files/{}.txt".format(file_name_final[0]),
                      Body=content)
    except Exception:
        raise
response = textract.analyze_document(
    Document={
        'S3Object': {
            'Bucket': 'your_bucket_name',
            #'Name': str(sys.argv[1])
            'Name': file_name
        }
    },
    FeatureTypes=['TABLES', 'FORMS'])
print('')
doc = trp.Document(response)
content = ''
for page in doc.pages:
    table = outputTable(page)
    for items in table:
        content += '\n'
        for item in items:
            content += item + '\t'
            #print(item, '\t', end=' ')
    #forms = outputForm(page)
s3.Object('your_bucket_name', file_name + '.txt').put(Body=content)
def test_custom_page_orientation(json_response):
    doc = Document(json_response)
    assert 1 == len(doc.pages)
    lines = [line for line in doc.pages[0].lines]
    assert 22 == len(lines)
    words = [word for line in lines for word in line.words]
    assert 53 == len(words)
    t_document: t2.TDocument = t2.TDocumentSchema().load(json_response)
    t_document.custom = {'orientation': 180}
    new_t_doc_json = t2.TDocumentSchema().dump(t_document)
    assert "Custom" in new_t_doc_json
    assert "orientation" in new_t_doc_json["Custom"]
    assert new_t_doc_json["Custom"]["orientation"] == 180

    p = os.path.dirname(os.path.realpath(__file__))
    f = open(os.path.join(p, "data/gib.json"))
    j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_page_orientation(t_document)
    assert -1 < t_document.pages[0].custom['Orientation'] < 2

    p = os.path.dirname(os.path.realpath(__file__))
    f = open(os.path.join(p, "data/gib_10_degrees.json"))
    j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_page_orientation(t_document)
    assert 5 < t_document.pages[0].custom['Orientation'] < 15

    p = os.path.dirname(os.path.realpath(__file__))
    f = open(os.path.join(p, "data/gib__15_degrees.json"))
    j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_page_orientation(t_document)
    assert 10 < t_document.pages[0].custom['Orientation'] < 20

    p = os.path.dirname(os.path.realpath(__file__))
    f = open(os.path.join(p, "data/gib__25_degrees.json"))
    j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_page_orientation(t_document)
    assert 17 < t_document.pages[0].custom['Orientation'] < 30

    p = os.path.dirname(os.path.realpath(__file__))
    f = open(os.path.join(p, "data/gib__180_degrees.json"))
    j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_page_orientation(t_document)
    assert 170 < t_document.pages[0].custom['Orientation'] < 190

    p = os.path.dirname(os.path.realpath(__file__))
    f = open(os.path.join(p, "data/gib__270_degrees.json"))
    j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_page_orientation(t_document)
    assert -100 < t_document.pages[0].custom['Orientation'] < -80

    p = os.path.dirname(os.path.realpath(__file__))
    f = open(os.path.join(p, "data/gib__90_degrees.json"))
    j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_page_orientation(t_document)
    assert 80 < t_document.pages[0].custom['Orientation'] < 100

    p = os.path.dirname(os.path.realpath(__file__))
    f = open(os.path.join(p, "data/gib__minus_10_degrees.json"))
    j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_page_orientation(t_document)
    assert -10 < t_document.pages[0].custom['Orientation'] < 5

    doc = t1.Document(t2.TDocumentSchema().dump(t_document))
    for page in doc.pages:
        assert page.custom['Orientation']
def test_Document():
    with open(blocks_json, "rt") as f:
        blocks = json.load(f)
    doc = trp.Document(blocks)
    assert doc
def get_bounding_boxes(textract_json: dict, overlay_features: List[Textract_Types],
                       document_dimensions: DocumentDimensions) -> List[BoundingBox]:
    doc = trp.Document(textract_json)
    bounding_box_list: List[BoundingBox] = list()
    page_number: int = 0
    for page in doc.pages:
        page_number += 1
        if Textract_Types.WORD in overlay_features or Textract_Types.LINE in overlay_features:
            for line in page.lines:
                if Textract_Types.LINE in overlay_features:
                    if line:
                        bounding_box_list.append(
                            BoundingBox(geometry=line.geometry,
                                        document_dimensions=document_dimensions,
                                        box_type=Textract_Types.LINE,
                                        page_number=page_number))
                if Textract_Types.WORD in overlay_features:
                    for word in line.words:
                        if word:
                            bounding_box_list.append(
                                BoundingBox(geometry=word.geometry,
                                            document_dimensions=document_dimensions,
                                            box_type=Textract_Types.WORD,
                                            page_number=page_number))
        if any([x for x in overlay_features
                if x in [Textract_Types.FORM, Textract_Types.KEY, Textract_Types.VALUE]]):
            for field in page.form.fields:
                if any([x for x in overlay_features
                        if x in [Textract_Types.FORM, Textract_Types.KEY]]):
                    if field and field.key:
                        bounding_box_list.append(
                            BoundingBox(geometry=field.key.geometry,
                                        document_dimensions=document_dimensions,
                                        box_type=Textract_Types.KEY,
                                        page_number=page_number))
                if any([x for x in overlay_features
                        if x in [Textract_Types.FORM, Textract_Types.VALUE]]):
                    if field and field.value:
                        bounding_box_list.append(
                            BoundingBox(geometry=field.value.geometry,
                                        document_dimensions=document_dimensions,
                                        box_type=Textract_Types.VALUE,
                                        page_number=page_number))
        if any([x for x in overlay_features
                if x in [Textract_Types.TABLE, Textract_Types.CELL]]):
            for table in page.tables:
                if Textract_Types.TABLE in overlay_features:
                    bounding_box_list.append(
                        BoundingBox(geometry=table.geometry,
                                    document_dimensions=document_dimensions,
                                    box_type=Textract_Types.TABLE,
                                    page_number=page_number))
                if Textract_Types.CELL in overlay_features:
                    for row in table.rows:
                        for cell in row.cells:
                            if cell:
                                bounding_box_list.append(
                                    BoundingBox(geometry=cell.geometry,
                                                document_dimensions=document_dimensions,
                                                box_type=Textract_Types.CELL,
                                                page_number=page_number))
    return bounding_box_list
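# Usage sketch for get_bounding_boxes above. The DocumentDimensions constructor arguments
# (doc_width/doc_height) and the 1000x1000 pixel size are assumptions for illustration, as is the
# response file name; only the overlay_features values come from the function itself.
import json

def demo_overlay_boxes(response_path: str = "analyze_doc_response.json") -> List[BoundingBox]:
    with open(response_path) as f:
        textract_json = json.load(f)
    boxes = get_bounding_boxes(textract_json=textract_json,
                               overlay_features=[Textract_Types.WORD, Textract_Types.CELL],
                               document_dimensions=DocumentDimensions(doc_width=1000, doc_height=1000))
    for box in boxes:
        print(box.box_type, box.page_number)
    return boxes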