def processRequest(request): output = "" print("request: {}".format(request)) bucketName = request["bucketName"] objectName = request["objectName"] documentsTable = request["documentsTable"] outputTable = request["outputTable"] print("Input Object: {}/{}".format(bucketName, objectName)) ext = FileHelper.getFileExtenstion(objectName.lower()) print("Extension: {}".format(ext)) if ext and ext in ["jpg", "jpeg", "png", "pdf"]: documentId = str(uuid.uuid1()) ds = datastore.DocumentStore(documentsTable, outputTable) ds.createDocument(documentId, bucketName, objectName) output = "Saved document {} for {}/{}".format(documentId, bucketName, objectName) print(output) return {"statusCode": 200, "body": json.dumps(output)}
def processImage(
    documentId, features, bucketName, objectName, outputTableName, documentsTableName
):
    detectText = "Text" in features
    detectForms = "Forms" in features
    detectTables = "Tables" in features

    response = callTextract(
        bucketName, objectName, detectText, detectForms, detectTables
    )

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTableName)

    print("Generating output for DocumentId: {}".format(documentId))

    opg = OutputGenerator(
        documentId, response, bucketName, objectName, detectForms, detectTables, ddb
    )
    opg.run()

    print("DocumentId: {}".format(documentId))

    ds = datastore.DocumentStore(documentsTableName, outputTableName)
    ds.markDocumentComplete(documentId)
def processImage(documentId, features, bucketName, outputBucketName, objectName,
                 outputTableName, documentsTableName, elasticsearchDomain):
    detectText = "Text" in features
    detectForms = "Forms" in features
    detectTables = "Tables" in features

    response = callTextract(bucketName, objectName, detectText, detectForms, detectTables)

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTableName)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId,
                                  SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(
        documentId, outputPath))

    opg = OutputGenerator(documentId, response, outputBucketName, objectName,
                          detectForms, detectTables, ddb, outputPath,
                          elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName, outputPath)

    # generate Comprehend and ComprehendMedical entities in S3
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    responseDocumentName = "{}{}response.json".format(outputPath, TEXTRACT_PATH_S3_PREFIX)
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, maxPages)

    # if Kendra is available then let it index the searchable pdf
    if 'KENDRA_INDEX_ID' in os.environ:
        kendraClient = KendraHelper()
        fileName = os.path.basename(objectName).split(".")[0]
        fileExtension = os.path.basename(objectName).split(".")[1]
        outputDocumentName = "{}{}-searchable.pdf".format(outputPath, fileName)
        kendraClient.indexDocument(os.environ['KENDRA_INDEX_ID'],
                                   os.environ['KENDRA_ROLE_ARN'],
                                   outputBucketName,
                                   outputDocumentName,
                                   documentId,
                                   fileExtension)

    print("Processed Comprehend data for document: {}".format(documentId))

    # merge the Textract key-value pairs into the Comprehend entities before indexing
    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTableName, outputTableName)
    ds.markDocumentComplete(documentId)
def processRequest(request): output = "" print(f"request: {request}") bucketName = request["bucketName"] objectName = request["objectName"] documentsTable = request["documentsTable"] outputTable = request["outputTable"] print(f"Input Object: {bucketName}/{objectName}") ext = FileHelper.getFileExtenstion(objectName.lower()) print(f"Extension: {ext}") if(ext and ext in ["jpg", "jpeg", "png", "pdf"]): documentId = str(uuid.uuid1()) ds = datastore.DocumentStore(documentsTable, outputTable) ds.createDocument(documentId, bucketName, objectName) output = f"Saved document {documentId} for {bucketName}/{objectName}" print(output) return { 'statusCode': 200, 'body': json.dumps(output) }
def deleteDocument(request):
    print("DeleteDocument request: {}".format(request))

    documentsTable = request["documentsTable"]
    outputTable = request["outputTable"]
    documentId = request["documentId"]

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.deleteDocument(documentId)
def processRequest(request): output = "" print(request) jobId = request['jobId'] jobTag = request['jobTag'] jobStatus = request['jobStatus'] jobAPI = request['jobAPI'] bucketName = request['bucketName'] objectName = request['objectName'] outputTable = request["outputTable"] outputBucket = request["outputBucket"] documentsTable = request["documentsTable"] qUrl = request["elasticQueueUrl"] pages = getJobResults(jobAPI, jobId) print("Result pages recieved: {}".format(len(pages))) dynamodb = AwsHelper().getResource("dynamodb") ddb = dynamodb.Table(outputTable) detectForms = False detectTables = False if (jobAPI == "StartDocumentAnalysis"): detectForms = True detectTables = True opg = OutputGenerator(jobId, jobTag, pages, outputBucket, objectName, detectForms, detectTables, ddb) opg.run() print("DocumentId: {}".format(jobTag)) ds = datastore.DocumentStore(documentsTable, outputTable) ds.markDocumentComplete(jobTag, jobId) jsonMessage = { 'documentId': jobTag, 'jobId': jobId, 'bucketName': outputBucket, 'objectName': objectName } client = AwsHelper().getClient('sqs') postMessage(client, qUrl, jsonMessage) output = "Processed -> Document: {}, Object: {}/{} processed.".format( jobTag, bucketName, objectName) print(output) return {'statusCode': 200, 'body': output}
def dataStore_getDocuments():
    # Documents
    print("*******************")
    dstore = datastore.DocumentStore(documentsTableName, outputTableName)
    docs = dstore.getDocuments()
    print(docs)
    print("------------")
    # page through results until no nextToken is returned
    while "nextToken" in docs:
        print(docs["nextToken"])
        docs = dstore.getDocuments(docs["nextToken"])
        print(docs)
        print("------------")
def createDocument(request): print("CreateDocument request: {}".format(request)) documentsTable = request["documentsTable"] outputTable = request["outputTable"] bucketName = request["bucketName"] objectName = request["objectName"] ds = datastore.DocumentStore(documentsTable, outputTable) documentId = str(uuid.uuid1()) ds.createDocument(documentId, bucketName, objectName) output = {"documentId": documentId} return output
def processRequest(request): output = "" print(request) jobId = request['jobId'] jobTag = request['jobTag'] jobStatus = request['jobStatus'] jobAPI = request['jobAPI'] bucketName = request['bucketName'] outputBucketName = request['outputBucketName'] objectName = request['objectName'] outputTable = request["outputTable"] documentsTable = request["documentsTable"] elasticsearchDomain = request["elasticsearchDomain"] pages = getJobResults(jobAPI, jobId) print("Result pages recieved: {}".format(len(pages))) dynamodb = AwsHelper().getResource("dynamodb") ddb = dynamodb.Table(outputTable) detectForms = False detectTables = False if (jobAPI == "StartDocumentAnalysis"): detectForms = True detectTables = True dynamodb = AwsHelper().getResource('dynamodb') ddb = dynamodb.Table(outputTable) opg = OutputGenerator(jobTag, pages, outputBucketName, objectName, detectForms, detectTables, ddb, elasticsearchDomain) opg.run() generatePdf(jobTag, bucketName, objectName, outputBucketName) print("DocumentId: {}".format(jobTag)) ds = datastore.DocumentStore(documentsTable, outputTable) ds.markDocumentComplete(jobTag) output = "Processed -> Document: {}, Object: {}/{} processed.".format( jobTag, bucketName, objectName) print(output) return {'statusCode': 200, 'body': output}
def setUp(self):
    self.conn = boto3.client('dynamodb', region_name=REGION)
    self.conn.create_table(
        TableName=DOCUMENTS_TABLE_NAME,
        KeySchema=[{"AttributeName": "documentId", "KeyType": "HASH"}],
        AttributeDefinitions=[{"AttributeName": "documentId", "AttributeType": "S"}],
        ProvisionedThroughput={"ReadCapacityUnits": 5, "WriteCapacityUnits": 5},
    )
    self.conn.put_item(
        TableName=DOCUMENTS_TABLE_NAME,
        Item={
            "documentId": {"S": "b1a54fda-1809-49d7-8f19-0d1688eb65b9"},
            "objectName": {"S": "public/samples/Misc/expense.png"},
            "bucketName": {"S": "dusstack-sample-s3-bucket"},
            "documentStatus": {"S": "IN_PROGRESS"},
        },
    )
    self.conn.put_item(
        TableName=DOCUMENTS_TABLE_NAME,
        Item={
            "documentId": {"S": "b1a99fda-1809-49d7-8f19-0d1688eb65b9"},
            "objectName": {"S": "public/samples/Misc/expense.png"},
            "bucketName": {"S": "dusstack-sample-s3-bucket"},
            "documentStatus": {"S": "IN_PROGRESS"},
        },
    )
    self.ds = datastore.DocumentStore(DOCUMENTS_TABLE_NAME, OUTPUT_TABLE_NAME)
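# A hedged usage sketch: once setUp has run, the seeded items can be read back
# through the DocumentStore. This assumes the enclosing test class runs under a
# DynamoDB mock such as moto's @mock_dynamodb (the harness is not shown in this
# file), and that getDocument returns the stored attributes as a plain dict --
# both assumptions, not facts from this repository.
def test_getDocument_returns_seeded_item(self):
    doc = self.ds.getDocument("b1a54fda-1809-49d7-8f19-0d1688eb65b9")
    # setUp seeded this document with status IN_PROGRESS
    self.assertEqual(doc["documentStatus"], "IN_PROGRESS")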
def processImage(documentId, features, bucketName, outputBucketName, objectName,
                 outputTableName, documentsTableName, elasticsearchDomain):
    detectText = "Text" in features
    detectForms = "Forms" in features
    detectTables = "Tables" in features

    response = callTextract(bucketName, objectName, detectText, detectForms, detectTables)

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTableName)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId,
                                  SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(
        documentId, outputPath))

    opg = OutputGenerator(documentId, response, outputBucketName, objectName,
                          detectForms, detectTables, ddb, outputPath,
                          elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName, outputPath)

    # generate Comprehend and ComprehendMedical entities in S3
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    responseDocumentName = "{}{}response.json".format(outputPath, TEXTRACT_PATH_S3_PREFIX)
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, maxPages)

    print("DocumentId: {}".format(documentId))
    print("Processed Comprehend data: {}".format(comprehendAndMedicalEntities))

    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTableName, outputTableName)
    ds.markDocumentComplete(documentId)
def getPageResponse(request):
    documentsTable = request["documentsTable"]
    outputTable = request["outputTable"]
    documentId = request["documentId"]
    page = request["page"]

    ds = datastore.DocumentStore(documentsTable, outputTable)
    doc = ds.getDocument(documentId)

    if doc and doc["documentStatus"] == "SUCCEEDED":
        fileName = "{}-analysis/{}/page-{}-response.json".format(
            doc["objectName"], doc["documentId"], page)
        responseJson = json.loads(S3Helper.readFromS3(doc["bucketName"], fileName))
        doc["textractResponse"] = responseJson

    output = {}
    if doc:
        output = doc

    return output
def processRequest(request): output = "" print(request) jobId = request["jobId"] jobTag = request["jobTag"] jobStatus = request["jobStatus"] jobAPI = request["jobAPI"] bucketName = request["bucketName"] objectName = request["objectName"] outputTable = request["outputTable"] documentsTable = request["documentsTable"] pages = getJobResults(jobAPI, jobId) print("Result pages recieved: {}".format(len(pages))) dynamodb = AwsHelper().getResource("dynamodb") ddb = dynamodb.Table(outputTable) detectForms = False detectTables = False if jobAPI == "StartDocumentAnalysis": detectForms = True detectTables = True dynamodb = AwsHelper().getResource("dynamodb") ddb = dynamodb.Table(outputTable) opg = OutputGenerator( jobTag, pages, bucketName, objectName, detectForms, detectTables, ddb ) opg.run() print("DocumentId: {}".format(jobTag)) ds = datastore.DocumentStore(documentsTable, outputTable) ds.markDocumentComplete(jobTag) output = "Processed -> Document: {}, Object: {}/{} processed.".format( jobTag, bucketName, objectName ) print(output) return {"statusCode": 200, "body": output}
def getPageForm(request):
    documentsTable = request["documentsTable"]
    outputTable = request["outputTable"]
    documentId = request["documentId"]
    page = request["page"]

    ds = datastore.DocumentStore(documentsTable, outputTable)
    doc = ds.getDocument(documentId)

    if doc and doc["documentStatus"] == "SUCCEEDED":
        fileName = "{}-analysis/{}/page-{}-forms.csv".format(
            doc["objectName"], doc["documentId"], page)
        file = S3Helper.readFromS3(doc["bucketName"], fileName)
        doc["textractResponse"] = parsePairs(file)

    output = {}
    if doc:
        output = doc

    return output
def getPageTable(request):
    documentsTable = request["documentsTable"]
    outputTable = request["outputTable"]
    documentId = request["documentId"]
    page = request["page"]

    ds = datastore.DocumentStore(documentsTable, outputTable)
    doc = ds.getDocument(documentId)

    # tables stays None when the document is missing or not yet processed
    tables = None
    if doc and doc["documentStatus"] == "SUCCEEDED":
        fileName = "{}-analysis/{}/page-{}-tables.csv".format(
            doc["objectName"], doc["documentId"], page)
        file = S3Helper.readFromS3(doc["bucketName"], fileName)
        tables = parseTables(getTableFromString(file))

    output = {"tables": []}
    if tables:
        output["tables"] = tables

    return output
def getDocument(request):
    print("GetDocument request: {}".format(request))

    documentsTable = request["documentsTable"]
    outputTable = request["outputTable"]
    documentId = request["documentId"]

    ds = datastore.DocumentStore(documentsTable, outputTable)
    doc = ds.getDocument(documentId)

    output = {}
    if doc:
        output = doc

    return output
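# Illustrative only: how a caller might route to getDocument above. The table
# names and documentId below are hypothetical placeholders.
def exampleGetDocumentCall():
    request = {
        "documentsTable": "DocumentsTable",   # hypothetical table name
        "outputTable": "OutputTable",         # hypothetical table name
        "documentId": "b1a54fda-1809-49d7-8f19-0d1688eb65b9",
    }
    # Returns the stored document attributes, or {} when the id is unknown.
    return getDocument(request)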
def processRequest(request): output = "" print("request: {}".format(request)) bucketName = request["bucketName"] objectName = request["objectName"] documentsTable = request["documentsTable"] outputTable = request["outputTable"] jobId = request["jobId"] invocationId = request["invocationId"] invocationSchemaVersion = request["invocationSchemaVersion"] taskId = request["taskId"] print("Input Object: {}/{}".format(bucketName, objectName)) ext = FileHelper.getFileExtenstion(objectName.lower()) print("Extension: {}".format(ext)) if ext and ext in ["jpg", "jpeg", "png", "pdf"]: documentId = str(uuid.uuid1()) ds = datastore.DocumentStore(documentsTable, outputTable) ds.createDocument(documentId, bucketName, objectName) output = "Saved document {} for {}/{}".format(documentId, bucketName, objectName) print(output) results = [{ "taskId": taskId, "resultCode": "Succeeded", "resultString": "Document submitted for processing with Id: {}".format(documentId), }] return { "invocationSchemaVersion": invocationSchemaVersion, "treatMissingKeysAs": "PermanentFailure", "invocationId": invocationId, "results": results, }
def processRequest(request): output = "" print("request: {}".format(request)) bucketName = request["bucketName"] objectName = request["objectName"] documentsTable = request["documentsTable"] outputTable = request["outputTable"] jobId = request["jobId"] invocationId = request['invocationId'] invocationSchemaVersion = request['invocationSchemaVersion'] taskId = request['taskId'] print("Input Object: {}/{}".format(bucketName, objectName)) ext = FileHelper.getFileExtenstion(objectName.lower()) print("Extension: {}".format(ext)) if (ext and ext in ["jpg", "jpeg", "png", "pdf"]): documentId = str(uuid.uuid1()) ds = datastore.DocumentStore(documentsTable, outputTable) ds.createDocument(documentId, bucketName, objectName) output = "Saved document {} for {}/{}".format(documentId, bucketName, objectName) print(output) results = [{ 'taskId': taskId, 'resultCode': 'Succeeded', 'resultString': "Document submitted for processing with Id: {}".format(documentId) }] return { 'invocationSchemaVersion': invocationSchemaVersion, 'treatMissingKeysAs': 'PermanentFailure', 'invocationId': invocationId, 'results': results }
def createDocument(request): print("CreateDocument request: {}".format(request)) documentsTable = request["documentsTable"] outputTable = request["outputTable"] bucketName = request["bucketName"] objectName = request["objectName"] ds = datastore.DocumentStore(documentsTable, outputTable) objectRootPrefix = objectName.split('/')[1] # if one of the available sample files, backend has to generate UUID. if objectRootPrefix == 'samples': documentId = generateDocumentID(bucketName) else: documentId = objectRootPrefix ds.createDocument(documentId, bucketName, objectName) output = {"documentId": documentId} return output
def processImage(documentId, features, bucketName, outputBucketName, objectName, outputTableName, documentsTableName, elasticsearchDomain): detectText = "Text" in features detectForms = "Forms" in features detectTables = "Tables" in features response = callTextract(bucketName, objectName, detectText, detectForms, detectTables) dynamodb = AwsHelper().getResource("dynamodb") ddb = dynamodb.Table(outputTableName) print("Generating output for DocumentId: {}".format(documentId)) opg = OutputGenerator(documentId, response, outputBucketName, objectName, detectForms, detectTables, ddb, elasticsearchDomain) opg_output = opg.run() generatePdf(documentId, bucketName, objectName, outputBucketName) # generate Comprehend and ComprehendMedical entities in S3 path = objectName + "-analysis" + "/" + documentId + "/" print("path: " + path) maxPages = 100 comprehendClient = ComprehendHelper() comprehendAndMedicalEntities = comprehendClient.processComprehend( outputBucketName, 'response.json', path, maxPages) print("DocumentId: {}".format(documentId)) print("Processed Comprehend data: {}".format(comprehendAndMedicalEntities)) for key, val in opg_output[KVPAIRS].items(): if key not in comprehendAndMedicalEntities: comprehendAndMedicalEntities[key] = val else: comprehendAndMedicalEntities[key].add(val) opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities) ds = datastore.DocumentStore(documentsTableName, outputTableName) ds.markDocumentComplete(documentId)
def processRequest(request): output = "" print(request) jobId = request['jobId'] jobTag = request['jobTag'] jobStatus = request['jobStatus'] jobAPI = request['jobAPI'] bucketName = request['bucketName'] outputBucketName = request['outputBucketName'] objectName = request['objectName'] outputTable = request["outputTable"] documentsTable = request["documentsTable"] elasticsearchDomain = request["elasticsearchDomain"] pages = getJobResults(jobAPI, jobId) print("Result pages recieved: {}".format(len(pages))) dynamodb = AwsHelper().getResource("dynamodb") ddb = dynamodb.Table(outputTable) detectForms = False detectTables = False if (jobAPI == "StartDocumentAnalysis"): detectForms = True detectTables = True dynamodb = AwsHelper().getResource('dynamodb') ddb = dynamodb.Table(outputTable) opg = OutputGenerator(jobTag, pages, outputBucketName, objectName, detectForms, detectTables, ddb, elasticsearchDomain) opg_output = opg.run() generatePdf(jobTag, bucketName, objectName, outputBucketName) # generate Comprehend and ComprehendMedical entities path = objectName + "-analysis" + "/" + jobTag + "/" print("path: " + path) maxPages = 100 comprehendClient = ComprehendHelper() comprehendAndMedicalEntities = comprehendClient.processComprehend( outputBucketName, 'response.json', path, maxPages) print("DocumentId: {}".format(jobTag)) # index document once the comprehend entities and KVPairs have been extracted for key, val in opg_output[KVPAIRS].items(): if key not in comprehendAndMedicalEntities: comprehendAndMedicalEntities[key] = val else: comprehendAndMedicalEntities[key].add(val) opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities) ds = datastore.DocumentStore(documentsTable, outputTable) ds.markDocumentComplete(jobTag) output = "Processed -> Document: {}, Object: {}/{} processed.".format( jobTag, bucketName, objectName) return {'statusCode': 200, 'body': output}
def processRequest(request): output = "" print("Request : {}".format(request)) jobId = request['jobId'] documentId = request['jobTag'] jobStatus = request['jobStatus'] jobAPI = request['jobAPI'] bucketName = request['bucketName'] outputBucketName = request['outputBucketName'] objectName = request['objectName'] outputTable = request["outputTable"] documentsTable = request["documentsTable"] elasticsearchDomain = request["elasticsearchDomain"] pages = getJobResults(jobAPI, jobId) print("Result pages recieved: {}".format(len(pages))) dynamodb = AwsHelper().getResource("dynamodb") ddb = dynamodb.Table(outputTable) detectForms = False detectTables = False if (jobAPI == "StartDocumentAnalysis"): detectForms = True detectTables = True dynamodb = AwsHelper().getResource('dynamodb') ddb = dynamodb.Table(outputTable) outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId, SERVICE_OUTPUT_PATH_S3_PREFIX) print("Generating output for DocumentId: {} and storing in {}".format( documentId, outputPath)) opg = OutputGenerator(documentId, pages, outputBucketName, objectName, detectForms, detectTables, ddb, outputPath, elasticsearchDomain) opg_output = opg.run() generatePdf(documentId, bucketName, objectName, outputBucketName, outputPath) # generate Comprehend and ComprehendMedical entities comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX) print("Comprehend output path: " + comprehendOutputPath) maxPages = 100 comprehendClient = ComprehendHelper() responseDocumentName = "{}{}response.json".format(outputPath, TEXTRACT_PATH_S3_PREFIX) comprehendAndMedicalEntities = comprehendClient.processComprehend( outputBucketName, responseDocumentName, comprehendOutputPath, maxPages) # if Kendra is available then let it index the document if 'KENDRA_INDEX_ID' in os.environ: kendraClient = KendraHelper() fileName = os.path.basename(objectName).split(".")[0] fileExtension = os.path.basename(objectName).split(".")[1] outputDocumentName = "{}{}-searchable.pdf".format(outputPath, fileName) kendraClient.indexDocument(os.environ['KENDRA_INDEX_ID'], os.environ['KENDRA_ROLE_ARN'], bucketName, outputDocumentName, documentId, fileExtension) print("DocumentId: {}".format(documentId)) print("Processed Comprehend data: {}".format(comprehendAndMedicalEntities)) # index document once the comprehend entities and KVPairs have been extracted for key, val in opg_output[KVPAIRS].items(): if key not in comprehendAndMedicalEntities: comprehendAndMedicalEntities[key] = val else: comprehendAndMedicalEntities[key].add(val) opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities) ds = datastore.DocumentStore(documentsTable, outputTable) ds.markDocumentComplete(documentId) output = "Processed -> Document: {}, Object: {}/{} processed.".format( documentId, bucketName, objectName) return {'statusCode': 200, 'body': output}