def test_words(json_response):
    doc = Document(json_response)
    assert 1 == len(doc.pages)
    lines = [line for line in doc.pages[0].lines]
    assert 22 == len(lines)
    words = [word for line in lines for word in line.words]
    assert 53 == len(words)
def AnalyzeDocument(byteImages):
    """
    Analyze one or more images with Amazon Textract and return the parsed documents.
    params:
        * byteImages: list of image byte arrays
    return:
        * None: error while analyzing an image
        * list of Document: analysis result for each image
    """
    f = []
    logging.info('Analyzing file(s)')
    for byteImage in byteImages:
        response = TEXTRACT.analyze_document(Document={'Bytes': byteImage},
                                             FeatureTypes=["FORMS"])
        if response['ResponseMetadata']['HTTPStatusCode'] == 200:
            f.append(Document(response))
        else:
            logging.info('Error analyzing file')
            return None
    return f
def lambda_handler(event, context):
    print("- - - Amazon Textract Demo - - -")

    # Read the bucket name from the event
    name_of_the_bucket = event['Records'][0]['s3']['bucket']['name']
    # Read the object key from the event
    name_of_the_doc = event['Records'][0]['s3']['object']['key']
    print(name_of_the_bucket)
    print(name_of_the_doc)

    # Synchronously analyze the input document for relationships between detected items
    # such as key-value pairs, tables, and selection elements.
    # API ref: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/textract.html#Textract.Client.analyze_document
    # For asynchronous processing, use start_document_analysis / start_document_text_detection instead, e.g.:
    # textract_response = textract_client.start_document_text_detection(
    #     DocumentLocation={'S3Object': {'Bucket': name_of_the_bucket, 'Name': name_of_the_doc}})
    response = textract_client.analyze_document(
        Document={
            'S3Object': {
                'Bucket': name_of_the_bucket,
                'Name': name_of_the_doc
            }
        },
        FeatureTypes=["TABLES", "FORMS"])
    print(str(response))

    doc = Document(response)

    for page in doc.pages:
        # Print tables
        for table in page.tables:
            for r, row in enumerate(table.rows):
                for c, cell in enumerate(row.cells):
                    print("Table[{}][{}] = {}".format(r, c, cell.text))

    for page in doc.pages:
        # Print form fields
        print("Fields:")
        for field in page.form.fields:
            print("Key: {}, Value: {}".format(field.key, field.value))
def run():
    filePath = "temp-response.json"
    response = json.loads(FileHelper.readFile(filePath))
    doc = Document(response)
    #print(doc)
    processDocument(doc)
def __init__(self, response, forms, tables, **kwargs):
    self.response = response
    self.forms = forms
    self.tables = tables
    self.documentId = kwargs.get("documentId", None)
    self.bucketName = kwargs.get("bucketName", None)
    self.objectName = kwargs.get("objectName", None)
    self.outputPath = "{}/ocr-analysis".format(self.objectName)
    self.document = Document(self.response)
def run():
    response = {}
    filePath = "test-response.json"
    with open(filePath, 'r') as document:
        response = json.loads(document.read())
    doc = Document(response)
    processDocument(doc)
def __init__(self, response, bucketName, objectName, tables, metadata):
    self.response = response
    self.bucketName = bucketName
    self.objectName = objectName
    self.tables = tables
    self.metadata = metadata
    self.outputPath = "{}-analysis/".format(objectName)
    self.document = Document(self.response)
def analyze():
    # Call Amazon Textract
    response = textract.analyze_document(
        Document={'S3Object': {
            'Bucket': s3BucketName,
            'Name': documentName
        }},
        FeatureTypes=["FORMS", "TABLES"])

    doc = Document(response)
    lines(doc)
def __init__(self, documentId, response, bucketName, objectName, forms, tables, ddb):
    self.documentId = documentId
    self.response = response
    self.bucketName = bucketName
    self.objectName = objectName
    self.forms = forms
    self.tables = tables
    self.ddb = ddb
    self.outputPath = "{}-analysis/{}/".format(objectName, documentId)
    self.document = Document(self.response)
def analyze():
    response = textract.analyze_document(
        Document={'S3Object': {
            'Bucket': s3BucketName,
            'Name': documentName
        }},
        FeatureTypes=["FORMS", "TABLES"])

    doc = Document(response)
    #lines(doc)
    #forms(doc)
    tables(doc)
def get_Lines(pgno, data):
    a, x = data
    doc = Document(a)
    w, h = x.size
    lines = list()
    for page in doc.pages:
        tbList = list()
        for table in page.tables:
            # Treat it as a real table only if every row has more than two cells
            is_table = True
            for row in table.rows:
                if len(row.cells) <= 2:
                    is_table = False
            if is_table:
                t = list()
                for row in table.rows:
                    r = [[cell.text] for cell in row.cells]
                    t.append(r)
                # tablesList is assumed to be defined outside this function and
                # accumulates (page number, table) pairs across pages
                tablesList.append((pgno, t))
            tbList.append(table.geometry)
        for line in page.lines:
            # Keep only lines that are not fully contained in a table bounding box
            inside = False
            for rect in tbList:
                if not (line.geometry.boundingBox.left >= rect.boundingBox.left
                        and line.geometry.boundingBox.top >= rect.boundingBox.top
                        and line.geometry.boundingBox.left + line.geometry.boundingBox.width
                        <= rect.boundingBox.left + rect.boundingBox.width
                        and line.geometry.boundingBox.top + line.geometry.boundingBox.height
                        <= rect.boundingBox.top + rect.boundingBox.height):
                    inside = True
                    if len(tbList) == 1:
                        lines.append(line)
                        inside = False
            if not inside and len(tbList) > 1:
                lines.append(line)
                inside = False
            elif len(tbList) == 0:
                lines.append(line)
    # Convert normalized coordinates to pixel coordinates of the page image
    lines = [{
        "Text": line.text,
        "Page": pgno,
        "Left": line.geometry.boundingBox.left * w,
        "Top": line.geometry.boundingBox.top * h,
        "Height": line.geometry.boundingBox.height * h,
        "Width": line.geometry.boundingBox.width * w
    } for line in lines]
    return pd.DataFrame(lines)
def invokeTextract(bucketName, documentKey):
    print('Loading invokeTextract')

    # Call Amazon Textract
    response = textract.analyze_document(
        Document={'S3Object': {
            'Bucket': bucketName,
            'Name': documentKey
        }},
        FeatureTypes=["FORMS"])

    document = Document(response)
    return document
def get_tables_from_pdf(s3BucketName, documentName):
    jobId = startJob(s3BucketName, documentName)
    print("Started job with id: {}".format(jobId))
    status = isJobComplete(jobId)
    if status == "SUCCEEDED":
        response = getJobResults(jobId)
        doc = Document(response)
        csv_tables = []
        for page in doc.pages:
            for table in page.tables:
                csv_tables.append(generate_csv_from_table(table))
        #tables = get_table_responses(response[0])  # Get first item in response
        return csv_tables
    elif status == "FAILED":
        return {"message": status}
def get_text_analysis(bucket_name, key):
    response_analysis = client_text.analyze_document(
        Document={'S3Object': {
            'Bucket': bucket_name,
            'Name': key
        }},
        FeatureTypes=['FORMS'])
    extract = Document(response_analysis)
    form_ext = []
    for page in extract.pages:
        print("Key Value Pairs:")
        for headings in page.form.fields:
            print("Detected Key: {}, Detected Value: {}".format(
                headings.key, headings.value))
            form_ext.append((str(headings.key), str(headings.value)))
    return form_ext
def s3_handler(record):
    # Process using the S3 object referenced in the event record
    response = txt_client.analyze_document(
        Document={
            'S3Object': {
                'Bucket': record['s3']['bucket']['name'],
                'Name': record['s3']['object']['key'],
            }
        },
        FeatureTypes=["TABLES"])
    grid_id = os.path.splitext(record['s3']['object']['key'].replace(
        'incoming/', ''))[0]

    # Get the text blocks and read the table cells into a flat list
    doc = Document(response)
    input_matrix = []
    for page in doc.pages:
        for table in page.tables:
            for r, row in enumerate(table.rows):
                for c, cell in enumerate(row.cells):
                    number = cell.text.replace('NOT_SELECTED,', '').replace(
                        'SELECTED,', '').replace(' ', '')
                    if number == '':
                        number = 0
                    try:
                        input_matrix += [int(number)]
                    except:
                        input_matrix += [number]
                    #print("Table[{}][{}] = {}".format(r, c, ttt))

    if len(input_matrix) == 81 and all(
        [isinstance(i, int) for i in input_matrix]):
        input_matrix = np.matrix(input_matrix).reshape(9, 9)
    else:
        dynamodb_table = boto3.resource('dynamodb').Table('sudokuGridRecords')
        print('Grid not recognized')
        send_sol_to_db = dynamodb_table.put_item(
            Item={
                'grid_id': grid_id,
                'input': json.dumps(np.array(input_matrix).ravel().tolist()),
                'solution': 'Grid could not be read',
            })
        raise Exception(f'Sudoku not detected in picture {grid_id}')
    return grid_id, input_matrix
def handler(event, context):
    try:
        srcbucket = event["Bucket"]
        srckey = event["Key"]
    except KeyError as ke:
        raise MalformedRequest(
            f"Missing field {ke}, please check your input payload")

    # Load and parse Textract result from S3
    textract_result = json.load(s3.Object(srcbucket, srckey).get()["Body"])
    doc = Document(textract_result)

    # Define post-processing variables
    amount_form_keys = ["total", "amount"]
    date_form_keys = ["date"]
    text = ""

    # Since we're just taking the first line as the vendor name, there'll be exactly one candidate:
    vendor_name_result = {"Confidence": 0, "Value": ""}
    vendor_name_candidates = [vendor_name_result]
    # For the other fields, we'll search for multiple options:
    date_candidates = []
    total_amount_candidates = []

    # Receipts don't usually list out a key-value pair like "Vendor: XYZ"; the business name is just the
    # first thing on the receipt! So we'll make that our assumption to extract the vendor:
    for item in textract_result["Blocks"]:
        if item["BlockType"] == "LINE":
            if vendor_name_result["Value"] == "":
                vendor_name_result["Value"] = item["Text"]
                # Setting the vendor name confidence = raw OCR confidence is a bit lazy and optimistic,
                # because we're not applying any reduction to reflect the fact that taking first line of
                # text = vendor name is an *assumption*... But it'll do for our sample:
                vendor_name_result["Confidence"] = item["Confidence"]
            else:
                # While we're looping through blocks anyway, we'll also collect all the text from the receipt
                # into a single string to search with Comprehend later:
                text += item["Text"] + " "

    # For amount and date fields, we'll try searching the key-value pairs first:
    # TODO: Refactor this loop for efficiency
    for page in doc.pages:
        for key in amount_form_keys:
            fields = page.form.searchFieldsByKey(key)
            for field in fields:
                # TODO: This should re-use amount_form_keys
                if (("total" in field.key.text.lower()
                     or "amount" in field.key.text.lower())
                        and field.value is not None):
                    try:
                        # If it's the total, the value should be parseable as a number!
                        # TODO: Allow for other leading currency symbols and 3-letter acronyms
                        a = float(field.value.text.lstrip("$"))
                        total_amount_candidates.append({
                            # Again because we're post-processing, our output "Confidence" scores should be
                            # driven by the Textract outputs but adjusted to reflect our business
                            # understanding... We'll take another pretty simple choice here:
                            "Confidence": min(field.key.confidence,
                                              field.value.confidence),
                            "Value": field.value.text,
                        })
                    except Exception as e:
                        print("Cannot parse string to number: {}".format(
                            field.value.text))
        for key in date_form_keys:
            fields = page.form.searchFieldsByKey(key)
            for field in fields:
                if "date" in field.key.text.lower() and field.value is not None:
                    date_candidates.append({
                        "Confidence": min(field.key.confidence,
                                          field.value.confidence),
                        "Value": field.value.text
                    })

    # If we couldn't find any date-looking fields in the key-value pairs (likely for verbose invoice-style
    # documents, but not for short receipts), then we'll use Amazon Comprehend to just detect date entities:
    if not len(date_candidates) > 0:
        comprehend_entities = comprehend.detect_entities(
            Text=text, LanguageCode="en")["Entities"]
        for entity in comprehend_entities:
            if entity.get("Type") == "DATE":
                value_str = entity.get("Text").strip("\t\n\r")
                # A little bit of validation that it looks date-like:
                if "/" in value_str or ":" in value_str or "-" in value_str:
                    date_candidates.append({
                        # Comprehend scores confidence 0-1 while Textract does 0-100: it doesn't matter
                        # which we standardize on as long as we choose one! Again, we could improve this
                        # confidence score by factoring in how confident the Textract OCR was on that span
                        # of text.
                        "Confidence": entity.get("Score", 0) * 100,
                        "Value": value_str
                    })

    # Sort our candidates by descending confidence and take the highest confidence candidate for each field:
    date_candidates = sorted(date_candidates,
                             key=lambda c: c["Confidence"],
                             reverse=True)
    total_amount_candidates = sorted(total_amount_candidates,
                                     key=lambda c: c["Confidence"],
                                     reverse=True)
    date_result = date_candidates[0] if len(date_candidates) else None
    total_amount_result = total_amount_candidates[0] if len(
        total_amount_candidates) else None

    result = {
        "Date": {
            "Confidence": date_result["Confidence"] if date_result else 0,
            "Value": date_result["Value"] if date_result else "",
        },
        "Total": {
            "Confidence":
            total_amount_result["Confidence"] if total_amount_result else 0,
            "Value":
            total_amount_result["Value"] if total_amount_result else "",
        },
        "Vendor": {
            "Confidence":
            vendor_name_result["Confidence"] if vendor_name_result else 0,
            "Value":
            vendor_name_result["Value"] if vendor_name_result else "",
        },
    }
    # How do we measure composite result "Confidence" for many fields driven by different logics? We'll just
    # take the minimum, since a human review should be triggered by the weakest field.
    result["Confidence"] = min(
        map(lambda f: result[f]["Confidence"], result.keys()))

    if len(date_candidates) > 1:
        result["Date"]["Alternatives"] = date_candidates[1:]
    if len(total_amount_candidates) > 1:
        result["Total"]["Alternatives"] = total_amount_candidates[1:]
    if len(vendor_name_candidates) > 1:
        result["Vendor"]["Alternatives"] = vendor_name_candidates[1:]
    return result
def process_text_analysis(bucket, document):
    # Get the document from S3
    s3_connection = boto3.resource('s3')
    s3_object = s3_connection.Object(bucket, document)
    s3_response = s3_object.get()

    stream = io.BytesIO(s3_response['Body'].read())
    image = Image.open(stream)

    # Analyze the document
    client = boto3.client('textract')
    image_binary = stream.getvalue()
    response = client.analyze_document(Document={'Bytes': image_binary},
                                       FeatureTypes=["TABLES", "FORMS"])
    # response = client.start_document_analysis(Document={'Bytes': image_binary},
    #                                           FeatureTypes=["TABLES", "FORMS"])

    # Alternatively, process using the S3 object directly
    #response = client.analyze_document(
    #    Document={'S3Object': {'Bucket': bucket, 'Name': document}},
    #    FeatureTypes=["TABLES", "FORMS"])

    doc = Document(response)

    # Get the text blocks
    blocks = response['Blocks']
    width, height = image.size
    draw = ImageDraw.Draw(image)
    print('Detected Document Text')

    # Create image showing bounding box/polygon of the detected lines/text
    if doc.pages:
        page = doc.pages[0]
        for field in page.form.fields:
            if (field.key and field.value
                    and "social security" in field.key.text.lower()):
                print("!!!FOUND SOCIAL!!!")
                x1 = field.value.geometry.boundingBox.left * width
                y1 = field.value.geometry.boundingBox.top * height - 2
                x2 = x1 + (field.value.geometry.boundingBox.width * width) + 5
                y2 = y1 + (field.value.geometry.boundingBox.height * height) + 2
                draw.rectangle([x1, y1, x2, y2], fill="Black")

    for block in blocks:
        DisplayBlockInformation(block)
        draw = ImageDraw.Draw(image)
        # if block['BlockType'] == "KEY_VALUE_SET":
        #     if block['EntityTypes'][0] == "KEY":
        #         ShowBoundingBox(draw, block['Geometry']['BoundingBox'], width, height, 'red')
        #     else:
        #         ShowBoundingBox(draw, block['Geometry']['BoundingBox'], width, height, 'green')
        # if block['BlockType'] == 'LINE':
        #     ShowBoundingBox(draw, block['Geometry']['BoundingBox'], width, height, 'blue')
        # if block['BlockType'] == 'WORD':
        #     ShowBoundingBox(draw, block['Geometry']['BoundingBox'], width, height, 'red')
        # if 'Confidence' in block:
        #     if block['BlockType'] == 'WORD':
        #         if block['Confidence'] <= 85:
        #             ShowBoundingBox(draw, block['Geometry']['BoundingBox'], width, height, 'red')
        #         elif 85 <= block['Confidence'] <= 98:
        #             ShowBoundingBox(draw, block['Geometry']['BoundingBox'], width, height, 'yellow')
        #         elif block['Confidence'] > 98:
        #             ShowBoundingBox(draw, block['Geometry']['BoundingBox'], width, height, 'green')

    for block in blocks:
        if block['BlockType'] == "KEY_VALUE_SET":
            if 'KEY' in block['EntityTypes']:
                ShowBoundingBox(draw, block['Geometry']['BoundingBox'], width,
                                height, 'red')
            else:
                ShowBoundingBox(draw, block['Geometry']['BoundingBox'], width,
                                height, 'blue')

        # Uncomment to draw the polygon for all blocks
        points = []
        # for polygon in block['Geometry']['Polygon']:
        #     points.append((width * polygon['X'], height * polygon['Y']))
        # draw.polygon((points), outline='blue')

    # Display the image
    image.show()

    return len(blocks)
def runComprehend(bucketName, objectName, callerId):
    comprehend = AwsHelper().getClient('comprehend')
    documentId, documentName = dissectObjectName(objectName)
    assert (
        documentId == S3Helper().getTagsS3(bucketName, objectName).get('documentId', None)
    ), "File path {} does not match the expected documentId tag of the object triggered.".format(
        objectName)

    textractOutputJson = json.loads(S3Helper().readFromS3(
        bucketName, objectName))
    og = OutputGenerator(response=textractOutputJson, forms=False, tables=False)

    pipeline_client.body = {
        "documentId": documentId,
        "bucketName": bucketName,
        "objectName": objectName,
        "stage": PIPELINE_STAGE
    }
    pipeline_client.stageInProgress()

    document = Document(textractOutputJson)
    originalFileName = "{}/{}".format(documentId, documentName)
    comprehendFileName = originalFileName + "/comprehend-output.json"
    comprehendFileS3Url = "https://{}.s3.amazonaws.com/{}".format(
        comprehendBucket, urllib.parse.quote_plus(comprehendFileName, safe="/"))
    tagging = "documentId={}".format(documentId)

    es.connect()
    esPayload = []
    page_num = 1
    for page in document.pages:
        table = og.structurePageTable(page)
        forms = og.structurePageForm(page)
        text = og.structurePageText(page)
        keyPhrases = []
        entitiesDetected = {}

        lenOfEncodedText = len(text)
        print("Comprehend documentId {} processing page {}".format(
            documentId, str(page_num)))
        print("Length of encoded text is " + str(lenOfEncodedText))
        if lenOfEncodedText > COMPREHEND_CHARACTER_LIMIT:
            print(
                "Size was too big to run singularly; breaking up the page text into chunks"
            )
            try:
                chunksOfText = chunkUpTheText(text)
            except Exception as e:
                pipeline_client.stageFailed(
                    "Could not determine how to snip the text on page {} into chunks."
                    .format(page_num))
                raise (e)
            keyPhrases, entitiesDetected = batchSendToComprehend(
                comprehend, chunksOfText, 'en')
        else:
            keyPhrases, entitiesDetected = singularSendToComprehend(
                comprehend, text, 'en')

        esPageLoad = compileESPayload(es, page_num, keyPhrases,
                                      entitiesDetected, text, table, forms,
                                      documentId)
        esPayload.append(esPageLoad)
        page_num = page_num + 1

    try:
        es.post_bulk(index=esIndex, payload=esPayload)
    except Exception as e:
        pipeline_client.stageFailed("Could not post to Elasticsearch")
        raise (e)
    print("Data uploaded to ES")

    try:
        S3Helper().writeToS3(json.dumps(esPayload),
                             comprehendBucket,
                             comprehendFileName,
                             taggingStr=tagging)
    except Exception as e:
        pipeline_client.stageFailed("Failed to write comprehend payload to S3")
        raise (e)

    lineage_client.recordLineage({
        "documentId": documentId,
        "callerId": callerId,
        "sourceBucketName": bucketName,
        "targetBucketName": comprehendBucket,
        "sourceFileName": objectName,
        "targetFileName": comprehendFileName
    })
    pipeline_client.stageSucceeded()
    print("Comprehend data uploaded to S3 at {}".format(comprehendFileName))
import boto3
from trp import Document

# Document
documentName = "employmentapp.png"

# Amazon Textract client
textract = boto3.client('textract')

# Call Amazon Textract
with open(documentName, "rb") as document:
    response = textract.analyze_document(
        Document={'Bytes': document.read()},
        FeatureTypes=["TABLES"])

#print(response)

doc = Document(response)

for page in doc.pages:
    # Print tables
    for table in page.tables:
        for r, row in enumerate(table.rows):
            for c, cell in enumerate(row.cells):
                print("Table[{}][{}] = {}".format(r, c, cell.text))
def handler(event, context): print("Received event: " + json.dumps(event, indent=2)) # Get the object from the event and show its content type bucket = event['Records'][0]['s3']['bucket']['name'] key = unquote_plus(event['Records'][0]['s3']['object']['key']) print("key is" + key) print("bucket is" + bucket) text = "" textvalues = [] textvalues_entity = {} try: s3.Bucket(bucket).download_file(Key=key, Filename='/tmp/{}') # Read document content with open('/tmp/{}', 'rb') as document: imageBytes = bytearray(document.read()) print("Object downloaded") response = textract.analyze_document(Document={'Bytes': imageBytes}, FeatureTypes=["TABLES", "FORMS"]) document = Document(response) table = [] forms = [] #print(document) for page in document.pages: table = outputTable(page) forms = outputForm(page) print(table) blocks = response['Blocks'] for block in blocks: if block['BlockType'] == 'LINE': text += block['Text'] + "\n" print(text) # Extracting Key Phrases keyphrase_response = comprehend.detect_key_phrases(Text=text, LanguageCode='pt') KeyPhraseList = keyphrase_response.get("KeyPhrases") for s in KeyPhraseList: textvalues.append(s.get("Text")) detect_entity = comprehend.detect_entities(Text=text, LanguageCode='pt') EntityList = detect_entity.get("Entities") for s in EntityList: textvalues_entity.update([(s.get("Type").strip('\t\n\r'), s.get("Text").strip('\t\n\r'))]) s3url = 'https://s3.console.aws.amazon.com/s3/object/' + bucket + '/' + key + '?region=' + region searchdata = { 's3link': s3url, 'KeyPhrases': textvalues, 'Entity': textvalues_entity, 'text': text, 'table': table, 'forms': forms } print(searchdata) print("connecting to ES") es = connectES() #es.index(index="resume-search", doc_type="_doc", body=searchdata) es.index(index="document", doc_type="_doc", body=searchdata) print("data uploaded to Elasticsearch") return 'keyphrases Successfully Uploaded' except Exception as e: print(e) print('Error: ') raise e
def test_forms(json_response):
    doc = Document(json_response)
    assert 4 == len(doc.pages[0].form.fields)
def test_tables(json_response):
    doc = Document(json_response)
    assert 1 == len(doc.pages[0].tables)
def lambda_handler(event, context): """ Get Extraction Status, JobTag and JobId from SNS. If the Status is SUCCEEDED then create a dict of the values and write those to the RDS database. """ #print(event) pattern_caja = [ 'efectivoyvalore', 'efectiv..equiv', 'efectivoyrequiv', 'cajaydisponible', 'efectivoequivalentesalefectivo' 'efectivoybancos', 'vequivalentesdeefectivo', '^efectivo$' ] pattern_ingreso = [ 'totaldeingresosoperacionales', 'totalingresosoperacionales', 'ingresosporventa', "ingresosdeactividadesordinarias", 'ingreso.*ordinaria' ] pattern_patrimonio = [ "^totalpatrimoni.$", "patrimoni.de", "totalcapital", "^patrimonia$", "patrimoni.neto", "patrimoni.total" ] pattern_activos = [ "^totalactivo$", "totalactiv.s$", "totaldeactiv.s$", "totaldelactiv.$", "tolallactiv.$", "activototal$", "activototal$", "activototal$", ] pattern_pasivos = [ "^pasivototal$", "^totalpasivo$", "t.t.lpasiv.s$", "t.t.lpasiv.$", "t.t.ldelp.siv.$" "t.t.ldep.siv.s$" ] pattern_costosventas = [ "costodeventa", "costodeproduccion", "totaldecostos", "gastosdeventas", "gastosdeventa", "costosfinancieros", "costosdeventas" ] pattern_utopera = [ "resultadosporactividadesdeoperacion", "utilidadopera", "utilidaddeoperacion", "gananciaporactividadesdeoperacion", "perdidaopera", "utilidad.*perdida.*opera" ] pattern_ubruta = [ "utilidadbruta", "ganancia.*bruta", "utilidad.*delperiodo", "utilidadporactividadesoperacionales", "margen.*brut", "EBITDA", "utilidad.*antes.*impuesto.*ganancias" ] pattern_antes_imp = ["antes"] pattern_uneta = [ "utilidadneta", "ganancianeta", "Ganancia.*perdida*neto", "utilidadnetaconsolidada", "resultadonetodelano", "resultadointegral" ] mypatterns = { 'p_caja': pattern_caja, 'p_ingreso': pattern_ingreso, 'p_patrimonio': pattern_patrimonio, 'p_activos': pattern_activos, 'p_pasivos': pattern_pasivos, 'p_cventas': pattern_costosventas, 'p_utopera': pattern_utopera, 'p_antesimp': pattern_antes_imp, 'p_uneta': pattern_uneta, 'p_ubruta': pattern_ubruta } notificationMessage = json.loads( json.dumps(event))['Records'][0]['Sns']['Message'] pdfTextExtractionStatus = json.loads(notificationMessage)['Status'] pdfTextExtractionJobTag = json.loads(notificationMessage)['JobTag'] pdfTextExtractionJobId = json.loads(notificationMessage)['JobId'] #print(pdfTextExtractionJobTag + ' : ' + pdfTextExtractionStatus) try: if (pdfTextExtractionStatus == 'SUCCEEDED'): response = getJobResultsAllinOne(pdfTextExtractionJobId) doc = Document(response) except: if (pdfTextExtractionStatus == 'SUCCEEDED'): response = getJobResults(pdfTextExtractionJobId) doc = Document(response) response = response[0] doc_key = pdfTextExtractionJobTag[:-4] query = make_query(doc, response, doc_key, mypatterns) connection = get_connection() cursor = connection.cursor() cursor.execute(query) connection.commit() cursor.close()
def ocr(self, filename):
    # Call Amazon Textract and extract table data
    self.response = self.textract.analyze_document(
        Document={
            'S3Object': {
                'Bucket': 'bucketname',  # give bucket name
                'Name': filename  # give key name of file present in S3
            }
        },
        FeatureTypes=["TABLES"])  # table format
    return self.response


tex_connect_obj = textract_connect()
tex_connect_obj.upload_s3()
response = tex_connect_obj.ocr('keyname')
doc = Document(response)  # after extracting, parse the response as a Document

for page in doc.pages:
    # print(type(page))
    for table in page.tables:
        # print(table.rows)
        for r, row in enumerate(table.rows):
            # for row in table.rows:
            #     print(row)
            for c, cell1 in enumerate(row.cells):
                # print(cell1)
                print("Table[{}][{}] = {}".format(r, c, cell1.text))
import random

import boto3
from trp import Document

bucket = "textract-acord-poc"
filename = "Multipage_test.pdf"

client = boto3.client('textract')

# start_document_analysis is asynchronous; it returns a JobId immediately
response = client.start_document_analysis(
    DocumentLocation={'S3Object': {
        'Bucket': bucket,
        'Name': filename
    }},
    FeatureTypes=["TABLES", "FORMS"])

# Pause here until the Textract job has finished before fetching the results
import pdb
pdb.set_trace()

print(response)
res = client.get_document_analysis(JobId=response["JobId"])
doc = Document(res)
print(doc)
doco_for_scan = "Below.jpg"

# Set up Boto3 client
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/textract.html
textract = boto3.client('textract', region_name="us-east-1")

response = textract.analyze_document(
    Document={'S3Object': {
        'Bucket': Bucket,
        'Name': doco_for_scan
    }},
    FeatureTypes=["FORMS"])  # 'TABLES'|'FORMS'

extract = Document(response)

# Start looping over pages and print the key-value pairs
for page in extract.pages:
    print("Key Value Pairs:")
    for headings in page.form.fields:
        print("Detected Key: {}, Detected Value: {}".format(
            headings.key, headings.value))

# Amazon Translate client
translate = boto3.client('translate')

# Use the same response to translate
print("Let's translate using AWS Translate")
for item in response["Blocks"]:
def test_custom_page_orientation(json_response):
    doc = Document(json_response)
    assert 1 == len(doc.pages)
    lines = [line for line in doc.pages[0].lines]
    assert 22 == len(lines)
    words = [word for line in lines for word in line.words]
    assert 53 == len(words)

    t_document: t2.TDocument = t2.TDocumentSchema().load(json_response)
    t_document.custom = {'orientation': 180}
    new_t_doc_json = t2.TDocumentSchema().dump(t_document)
    assert "Custom" in new_t_doc_json
    assert "orientation" in new_t_doc_json["Custom"]
    assert new_t_doc_json["Custom"]["orientation"] == 180

    p = os.path.dirname(os.path.realpath(__file__))
    f = open(os.path.join(p, "data/gib.json"))
    j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_page_orientation(t_document)
    assert -1 < t_document.pages[0].custom['Orientation'] < 2

    p = os.path.dirname(os.path.realpath(__file__))
    f = open(os.path.join(p, "data/gib_10_degrees.json"))
    j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_page_orientation(t_document)
    assert 5 < t_document.pages[0].custom['Orientation'] < 15

    p = os.path.dirname(os.path.realpath(__file__))
    f = open(os.path.join(p, "data/gib__15_degrees.json"))
    j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_page_orientation(t_document)
    assert 10 < t_document.pages[0].custom['Orientation'] < 20

    p = os.path.dirname(os.path.realpath(__file__))
    f = open(os.path.join(p, "data/gib__25_degrees.json"))
    j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_page_orientation(t_document)
    assert 17 < t_document.pages[0].custom['Orientation'] < 30

    p = os.path.dirname(os.path.realpath(__file__))
    f = open(os.path.join(p, "data/gib__180_degrees.json"))
    j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_page_orientation(t_document)
    assert 170 < t_document.pages[0].custom['Orientation'] < 190

    p = os.path.dirname(os.path.realpath(__file__))
    f = open(os.path.join(p, "data/gib__270_degrees.json"))
    j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_page_orientation(t_document)
    assert -100 < t_document.pages[0].custom['Orientation'] < -80

    p = os.path.dirname(os.path.realpath(__file__))
    f = open(os.path.join(p, "data/gib__90_degrees.json"))
    j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_page_orientation(t_document)
    assert 80 < t_document.pages[0].custom['Orientation'] < 100

    p = os.path.dirname(os.path.realpath(__file__))
    f = open(os.path.join(p, "data/gib__minus_10_degrees.json"))
    j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_page_orientation(t_document)
    assert -10 < t_document.pages[0].custom['Orientation'] < 5

    doc = t1.Document(t2.TDocumentSchema().dump(t_document))
    for page in doc.pages:
        assert page.custom['Orientation']