def processImage(documentId, features, bucketName, outputBucketName, objectName, outputTableName, documentsTableName, elasticsearchDomain):

    detectText = "Text" in features
    detectForms = "Forms" in features
    detectTables = "Tables" in features

    response = callTextract(bucketName, objectName,
                            detectText, detectForms, detectTables)

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTableName)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId,
                                  SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(
        documentId, outputPath))

    opg = OutputGenerator(documentId, response, outputBucketName, objectName,
                          detectForms, detectTables, ddb, outputPath, elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName, outputPath)

    # generate Comprehend and ComprehendMedical entities in S3
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    responseDocumentName = "{}{}response.json".format(outputPath, TEXTRACT_PATH_S3_PREFIX)
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, maxPages)

    # if Kendra is available then let it index the document
    # index the searchable pdf in Kendra
    if 'KENDRA_INDEX_ID' in os.environ:
        kendraClient = KendraHelper()
        fileName = os.path.basename(objectName).split(".")[0]
        fileExtension = os.path.basename(objectName).split(".")[1]
        outputDocumentName = "{}{}-searchable.pdf".format(outputPath, fileName)
        kendraClient.indexDocument(os.environ['KENDRA_INDEX_ID'],
                                   os.environ['KENDRA_ROLE_ARN'],
                                   outputBucketName,
                                   outputDocumentName,
                                   documentId,
                                   fileExtension)

    print("Processed Comprehend data for document: {}".format(documentId))

    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTableName, outputTableName)
    ds.markDocumentComplete(documentId)
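# The KendraHelper used above is defined elsewhere in this solution and is not shown in
# this section. The following is only a minimal sketch of what an indexDocument helper
# with this signature might do, assuming it wraps the boto3 Kendra BatchPutDocument API
# and indexes the searchable PDF directly from S3. The class and variable names are
# illustrative assumptions, not the actual helper implementation.
import boto3


class KendraHelperSketch:

    def indexDocument(self, indexId, roleArn, bucket, key, documentId, fileExtension):
        # submit the S3 object to the Kendra index; Kendra fetches and parses it itself
        kendra = boto3.client("kendra")
        kendra.batch_put_document(
            IndexId=indexId,
            RoleArn=roleArn,
            Documents=[{
                "Id": documentId,
                "S3Path": {"Bucket": bucket, "Key": key},
                "ContentType": "PDF" if fileExtension.lower() == "pdf" else "PLAIN_TEXT"
            }]
        )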
def processImage(documentId, features, bucketName, outputBucketName, objectName, outputTableName, documentsTableName, elasticsearchDomain):

    detectText = "Text" in features
    detectForms = "Forms" in features
    detectTables = "Tables" in features

    response = callTextract(bucketName, objectName,
                            detectText, detectForms, detectTables)

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTableName)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId,
                                  SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(
        documentId, outputPath))

    opg = OutputGenerator(documentId, response, outputBucketName, objectName,
                          detectForms, detectTables, ddb, outputPath, elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName, outputPath)

    # generate Comprehend and ComprehendMedical entities in S3
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    responseDocumentName = "{}{}response.json".format(outputPath, TEXTRACT_PATH_S3_PREFIX)
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, maxPages)

    print("DocumentId: {}".format(documentId))
    print("Processed Comprehend data: {}".format(comprehendAndMedicalEntities))

    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTableName, outputTableName)
    ds.markDocumentComplete(documentId)
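# ComprehendHelper.processComprehend is defined elsewhere in this repo. The sketch below
# only illustrates the general shape of such a helper, assuming it loads the stored
# Textract response JSON from S3 (with a top-level "Blocks" list), runs Comprehend and
# Comprehend Medical entity detection on the extracted text, writes the merged result
# under the given output path, and returns a dict mapping entity types to sets of entity
# values. All names below are illustrative; paging and the maxPages limit are omitted.
import json

import boto3


def processComprehendSketch(bucket, textractResponseKey, comprehendOutputPath):
    s3 = boto3.client("s3")
    comprehend = boto3.client("comprehend")
    comprehendMedical = boto3.client("comprehendmedical")

    # read the Textract output and concatenate the LINE blocks into plain text;
    # truncate to stay well within the Comprehend/Comprehend Medical request limits
    body = s3.get_object(Bucket=bucket, Key=textractResponseKey)["Body"].read()
    blocks = json.loads(body)["Blocks"]
    text = " ".join(b["Text"] for b in blocks if b["BlockType"] == "LINE")[:5000]

    entities = {}
    for e in comprehend.detect_entities(Text=text, LanguageCode="en")["Entities"]:
        entities.setdefault(e["Type"], set()).add(e["Text"])
    for e in comprehendMedical.detect_entities_v2(Text=text)["Entities"]:
        entities.setdefault(e["Type"], set()).add(e["Text"])

    # persist the merged entities next to the other per-document output
    s3.put_object(Bucket=bucket,
                  Key=comprehendOutputPath + "entities.json",
                  Body=json.dumps({k: sorted(v) for k, v in entities.items()}))
    return entities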
def processImage(documentId, features, bucketName, outputBucketName, objectName, outputTableName, documentsTableName, elasticsearchDomain):

    detectText = "Text" in features
    detectForms = "Forms" in features
    detectTables = "Tables" in features

    response = callTextract(bucketName, objectName,
                            detectText, detectForms, detectTables)

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTableName)

    print("Generating output for DocumentId: {}".format(documentId))

    opg = OutputGenerator(documentId, response, outputBucketName, objectName,
                          detectForms, detectTables, ddb, elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName)

    # generate Comprehend and ComprehendMedical entities in S3
    path = objectName + "-analysis" + "/" + documentId + "/"
    print("path: " + path)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, 'response.json', path, maxPages)

    print("DocumentId: {}".format(documentId))
    print("Processed Comprehend data: {}".format(comprehendAndMedicalEntities))

    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTableName, outputTableName)
    ds.markDocumentComplete(documentId)
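# A minimal sketch of how processImage might be invoked from the synchronous pipeline's
# Lambda handler. The bucket names, table names, document id, and Elasticsearch endpoint
# below are placeholder assumptions for illustration, not the solution's actual values
# or event format.
if __name__ == "__main__":
    sampleRequest = {
        "documentId": "doc-0001",
        "features": ["Text", "Forms", "Tables"],
        "bucketName": "my-input-bucket",
        "outputBucketName": "my-output-bucket",
        "objectName": "uploads/sample.png",
        "outputTableName": "OutputTable",
        "documentsTableName": "DocumentsTable",
        "elasticsearchDomain": "https://search-my-domain.us-east-1.es.amazonaws.com"
    }
    processImage(sampleRequest["documentId"], sampleRequest["features"],
                 sampleRequest["bucketName"], sampleRequest["outputBucketName"],
                 sampleRequest["objectName"], sampleRequest["outputTableName"],
                 sampleRequest["documentsTableName"], sampleRequest["elasticsearchDomain"])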
def processRequest(request):

    output = ""

    print(request)

    jobId = request['jobId']
    jobTag = request['jobTag']
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    outputBucketName = request['outputBucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]
    elasticsearchDomain = request["elasticsearchDomain"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages received: {}".format(len(pages)))

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    detectForms = False
    detectTables = False
    if jobAPI == "StartDocumentAnalysis":
        detectForms = True
        detectTables = True

    opg = OutputGenerator(jobTag, pages, outputBucketName, objectName,
                          detectForms, detectTables, ddb, elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(jobTag, bucketName, objectName, outputBucketName)

    # generate Comprehend and ComprehendMedical entities
    path = objectName + "-analysis" + "/" + jobTag + "/"
    print("path: " + path)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, 'response.json', path, maxPages)

    print("DocumentId: {}".format(jobTag))

    # index document once the comprehend entities and KVPairs have been extracted
    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(jobTag)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName)

    return {'statusCode': 200, 'body': output}
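# getJobResults is defined elsewhere in this file. A minimal sketch of the usual pattern,
# assuming it pages through the completed asynchronous Textract job with NextToken and
# returns the list of response pages; the function name is illustrative, not the repo's
# implementation.
import boto3


def getJobResultsSketch(jobAPI, jobId):
    textract = boto3.client("textract")
    pages = []
    nextToken = None
    while True:
        kwargs = {"JobId": jobId, "MaxResults": 1000}
        if nextToken:
            kwargs["NextToken"] = nextToken
        # analysis jobs return forms/tables; text-detection jobs return plain text blocks
        if jobAPI == "StartDocumentAnalysis":
            response = textract.get_document_analysis(**kwargs)
        else:
            response = textract.get_document_text_detection(**kwargs)
        pages.append(response)
        nextToken = response.get("NextToken")
        if not nextToken:
            break
    return pages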
def processRequest(request):

    output = ""

    print("Request : {}".format(request))

    jobId = request['jobId']
    documentId = request['jobTag']
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    outputBucketName = request['outputBucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]
    elasticsearchDomain = request["elasticsearchDomain"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages received: {}".format(len(pages)))

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    detectForms = False
    detectTables = False
    if jobAPI == "StartDocumentAnalysis":
        detectForms = True
        detectTables = True

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId,
                                  SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(
        documentId, outputPath))

    opg = OutputGenerator(documentId, pages, outputBucketName, objectName,
                          detectForms, detectTables, ddb, outputPath, elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName, outputPath)

    # generate Comprehend and ComprehendMedical entities
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    responseDocumentName = "{}{}response.json".format(outputPath, TEXTRACT_PATH_S3_PREFIX)
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, maxPages)

    # if Kendra is available then let it index the document
    if 'KENDRA_INDEX_ID' in os.environ:
        kendraClient = KendraHelper()
        fileName = os.path.basename(objectName).split(".")[0]
        fileExtension = os.path.basename(objectName).split(".")[1]
        outputDocumentName = "{}{}-searchable.pdf".format(outputPath, fileName)
        kendraClient.indexDocument(os.environ['KENDRA_INDEX_ID'],
                                   os.environ['KENDRA_ROLE_ARN'],
                                   outputBucketName,
                                   outputDocumentName,
                                   documentId,
                                   fileExtension)

    print("DocumentId: {}".format(documentId))
    print("Processed Comprehend data: {}".format(comprehendAndMedicalEntities))

    # index document once the comprehend entities and KVPairs have been extracted
    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(documentId)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        documentId, bucketName, objectName)

    return {'statusCode': 200, 'body': output}
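# A minimal sketch of calling processRequest once a Textract job-completion notification
# has been received and translated into this request shape. All field values below are
# placeholder assumptions for illustration only; the real pipeline builds this dict from
# the job-completion message.
if __name__ == "__main__":
    sampleJobRequest = {
        "jobId": "1234567890abcdef",
        "jobTag": "doc-0001",
        "jobStatus": "SUCCEEDED",
        "jobAPI": "StartDocumentAnalysis",
        "bucketName": "my-input-bucket",
        "outputBucketName": "my-output-bucket",
        "objectName": "uploads/sample.pdf",
        "outputTable": "OutputTable",
        "documentsTable": "DocumentsTable",
        "elasticsearchDomain": "https://search-my-domain.us-east-1.es.amazonaws.com"
    }
    result = processRequest(sampleJobRequest)
    print(result["body"])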