def processRequest(request):
    """Register an S3 object as a new document when its extension is supported.

    Args:
        request: Mapping that provides "bucketName", "objectName",
            "documentsTable" and "outputTable".

    Returns:
        dict with "statusCode" 200 and a JSON-encoded "body". The body holds
        the confirmation message when a document record was created, or an
        empty string when the extension is not jpg, jpeg, png or pdf.
    """
    print("request: {}".format(request))

    srcBucket = request["bucketName"]
    srcKey = request["objectName"]
    docsTableName = request["documentsTable"]
    resultsTableName = request["outputTable"]

    print("Input Object: {}/{}".format(srcBucket, srcKey))

    # The extension is derived from the lower-cased key, so the comparison
    # below is effectively case-insensitive.
    extension = FileHelper.getFileExtenstion(srcKey.lower())
    print("Extension: {}".format(extension))

    message = ""
    supportedExtensions = ("jpg", "jpeg", "png", "pdf")
    if not extension or extension not in supportedExtensions:
        # Unsupported input: nothing is stored and the body stays empty.
        return {"statusCode": 200, "body": json.dumps(message)}

    newDocumentId = str(uuid.uuid1())
    store = datastore.DocumentStore(docsTableName, resultsTableName)
    store.createDocument(newDocumentId, srcBucket, srcKey)

    message = "Saved document {} for {}/{}".format(newDocumentId, srcBucket,
                                                   srcKey)
    print(message)

    return {"statusCode": 200, "body": json.dumps(message)}
예제 #2
0
def processImage(
    documentId, features, bucketName, objectName, outputTableName, documentsTableName
):
    """Analyze one stored object and mark its document record complete.

    The object is passed to ``callTextract``; the response is handed to an
    ``OutputGenerator`` together with the DynamoDB output table, and finally
    the document is marked complete in the ``DocumentStore``.

    Args:
        documentId: Identifier of the document being processed.
        features: Container checked for the entries "Text", "Forms", "Tables".
        bucketName: Bucket of the source object (also passed to the generator).
        objectName: Key of the source object.
        outputTableName: Name of the DynamoDB table used for output.
        documentsTableName: Name of the documents table.
    """
    wantText = "Text" in features
    wantForms = "Forms" in features
    wantTables = "Tables" in features

    textractResponse = callTextract(
        bucketName, objectName, wantText, wantForms, wantTables
    )

    outputTableHandle = AwsHelper().getResource("dynamodb").Table(outputTableName)

    print("Generating output for DocumentId: {}".format(documentId))

    generator = OutputGenerator(
        documentId,
        textractResponse,
        bucketName,
        objectName,
        wantForms,
        wantTables,
        outputTableHandle,
    )
    generator.run()

    print("DocumentId: {}".format(documentId))

    store = datastore.DocumentStore(documentsTableName, outputTableName)
    store.markDocumentComplete(documentId)
def processImage(documentId, features, bucketName, outputBucketName,
                 objectName, outputTableName, documentsTableName,
                 elasticsearchDomain):
    """Process one image synchronously and publish every derived output.

    Steps, in order: call ``callTextract`` on the source object, run
    ``OutputGenerator`` against the output bucket/path, call ``generatePdf``,
    run ``ComprehendHelper.processComprehend`` on the stored response, index
    the searchable PDF with ``KendraHelper`` when the environment variable
    KENDRA_INDEX_ID is set, merge the generator's key/value pairs into the
    Comprehend entities, index the document through the generator, and mark
    the document complete in the ``DocumentStore``.

    Args:
        documentId: Identifier of the document being processed.
        features: Container checked for the entries "Text", "Forms", "Tables".
        bucketName: Bucket holding the source object.
        outputBucketName: Bucket that receives generated output.
        objectName: Key of the source object.
        outputTableName: Name of the DynamoDB output table.
        documentsTableName: Name of the documents table.
        elasticsearchDomain: Passed through to ``OutputGenerator``.
    """
    detectText = "Text" in features
    detectForms = "Forms" in features
    detectTables = "Tables" in features

    response = callTextract(bucketName, objectName, detectText, detectForms,
                            detectTables)

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTableName)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId,
                                  SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(
        documentId, outputPath))

    opg = OutputGenerator(documentId, response, outputBucketName, objectName,
                          detectForms, detectTables, ddb, outputPath,
                          elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName,
                outputPath)

    # generate Comprehend and ComprehendMedical entities in S3
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    responseDocumentName = "{}{}response.json".format(outputPath,
                                                      TEXTRACT_PATH_S3_PREFIX)
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, maxPages)

    # if Kendra is available then let it index the document
    # index the searchable pdf in Kendra
    if 'KENDRA_INDEX_ID' in os.environ:
        kendraClient = KendraHelper()
        baseName = os.path.basename(objectName)
        # The stem stays "everything before the first dot".
        # NOTE(review): presumably this must match the name generatePdf
        # gives the searchable PDF - confirm before changing it.
        fileName = baseName.split(".")[0]
        # The extension is the text after the LAST dot, so "scan.v2.pdf"
        # yields "pdf" (not "v2"), and a name without a dot yields "" instead
        # of raising IndexError.
        fileExtension = baseName.rpartition(".")[2] if "." in baseName else ""
        outputDocumentName = "{}{}-searchable.pdf".format(outputPath, fileName)
        kendraClient.indexDocument(os.environ['KENDRA_INDEX_ID'],
                                   os.environ['KENDRA_ROLE_ARN'],
                                   outputBucketName, outputDocumentName,
                                   documentId, fileExtension)

    print("Processed Comprehend data for document: {}".format(documentId))

    # Fold the extracted key/value pairs into the entity mapping: unseen keys
    # are assigned directly, existing keys receive the value through .add().
    # NOTE(review): this assumes existing entries expose .add() (e.g. sets) -
    # confirm against ComprehendHelper.processComprehend.
    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTableName, outputTableName)
    ds.markDocumentComplete(documentId)
예제 #4
0
def processRequest(request):
    """Create a document record for a supported S3 object.

    Args:
        request: Mapping that provides "bucketName", "objectName",
            "documentsTable" and "outputTable".

    Returns:
        dict with 'statusCode' 200 and a JSON-encoded 'body': the confirmation
        message when a record was created, otherwise an empty string.
    """
    print(f"request: {request}")
    bucketName = request["bucketName"]
    objectName = request["objectName"]
    documentsTable = request["documentsTable"]
    outputTable = request["outputTable"]

    print(f"Input Object: {bucketName}/{objectName}")

    # Extension is taken from the lower-cased key.
    ext = FileHelper.getFileExtenstion(objectName.lower())
    print(f"Extension: {ext}")

    if not ext or ext not in ("jpg", "jpeg", "png", "pdf"):
        # Unsupported extension: no record is written, body is empty.
        return {'statusCode': 200, 'body': json.dumps("")}

    documentId = str(uuid.uuid1())
    store = datastore.DocumentStore(documentsTable, outputTable)
    store.createDocument(documentId, bucketName, objectName)

    output = f"Saved document {documentId} for {bucketName}/{objectName}"
    print(output)

    response = {'statusCode': 200}
    response['body'] = json.dumps(output)
    return response
예제 #5
0
def deleteDocument(request):
    """Delete the document named in ``request`` through the DocumentStore.

    Args:
        request: Mapping that provides "documentsTable", "outputTable" and
            "documentId".
    """
    print("DeleteDocument request: {}".format(request))

    # Read every required key up front, before the store is constructed.
    docsTableName, resultsTableName, targetId = (
        request["documentsTable"],
        request["outputTable"],
        request["documentId"],
    )

    store = datastore.DocumentStore(docsTableName, resultsTableName)
    store.deleteDocument(targetId)
예제 #6
0
def processRequest(request):
    """Handle a finished job: generate output, mark complete, notify via SQS.

    Args:
        request: Mapping that provides 'jobId', 'jobTag', 'jobStatus',
            'jobAPI', 'bucketName', 'objectName', "outputTable",
            "outputBucket", "documentsTable" and "elasticQueueUrl".

    Returns:
        dict with 'statusCode' 200 and a plain-text 'body' summary.
    """
    print(request)

    jobId = request['jobId']
    documentTag = request['jobTag']
    jobStatus = request['jobStatus']  # read from the request; not used below
    jobAPI = request['jobAPI']
    sourceBucket = request['bucketName']
    sourceKey = request['objectName']
    resultsTableName = request["outputTable"]
    resultsBucket = request["outputBucket"]
    docsTableName = request["documentsTable"]
    queueUrl = request["elasticQueueUrl"]

    resultPages = getJobResults(jobAPI, jobId)

    print("Result pages recieved: {}".format(len(resultPages)))

    resultsTable = AwsHelper().getResource("dynamodb").Table(resultsTableName)

    # Forms and tables are enabled only for the document-analysis API.
    analysisJob = jobAPI == "StartDocumentAnalysis"
    wantForms = analysisJob
    wantTables = analysisJob

    generator = OutputGenerator(jobId, documentTag, resultPages, resultsBucket,
                                sourceKey, wantForms, wantTables, resultsTable)
    generator.run()

    print("DocumentId: {}".format(documentTag))

    store = datastore.DocumentStore(docsTableName, resultsTableName)
    store.markDocumentComplete(documentTag, jobId)

    notification = {
        'documentId': documentTag,
        'jobId': jobId,
        'bucketName': resultsBucket,
        'objectName': sourceKey
    }

    sqsClient = AwsHelper().getClient('sqs')
    postMessage(sqsClient, queueUrl, notification)

    summary = "Processed -> Document: {}, Object: {}/{} processed.".format(
        documentTag, sourceBucket, sourceKey)

    print(summary)

    return {'statusCode': 200, 'body': summary}
def dataStore_getDocuments():
    """Print every page of results returned by ``DocumentStore.getDocuments``.

    The first page is fetched without a token; while a page contains a
    "nextToken" entry, that token is printed and used to fetch the next page.
    """
    # Document listing
    print("*******************")
    store = datastore.DocumentStore(documentsTableName, outputTableName)
    currentPage = store.getDocuments()
    print(currentPage)
    print("------------")
    while True:
        if "nextToken" not in currentPage:
            break
        token = currentPage["nextToken"]
        print(token)
        currentPage = store.getDocuments(token)
        print(currentPage)
    print("------------")
예제 #8
0
def createDocument(request):
    """Create a document record with a freshly generated UUID.

    Args:
        request: Mapping that provides "documentsTable", "outputTable",
            "bucketName" and "objectName".

    Returns:
        dict of the form {"documentId": <new id>}.
    """
    print("CreateDocument request: {}".format(request))

    docsTableName = request["documentsTable"]
    resultsTableName = request["outputTable"]
    sourceBucket = request["bucketName"]
    sourceKey = request["objectName"]

    store = datastore.DocumentStore(docsTableName, resultsTableName)
    newId = str(uuid.uuid1())
    store.createDocument(newId, sourceBucket, sourceKey)

    return {"documentId": newId}
def processRequest(request):
    """Handle a finished job: generate output and a PDF, then mark complete.

    Args:
        request: Mapping that provides 'jobId', 'jobTag', 'jobStatus',
            'jobAPI', 'bucketName', 'outputBucketName', 'objectName',
            "outputTable", "documentsTable" and "elasticsearchDomain".

    Returns:
        dict with 'statusCode' 200 and a plain-text 'body' summary.

    Raises:
        KeyError: If any of the keys listed above is missing from ``request``.
    """
    print(request)

    jobId = request['jobId']
    jobTag = request['jobTag']
    # Read so that a request without the key still fails fast; the value
    # itself is not used below.
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    outputBucketName = request['outputBucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]
    elasticsearchDomain = request["elasticsearchDomain"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages recieved: {}".format(len(pages)))

    # Forms and tables are enabled only for the document-analysis API.
    isAnalysisJob = jobAPI == "StartDocumentAnalysis"
    detectForms = isAnalysisJob
    detectTables = isAnalysisJob

    # Build the DynamoDB table handle once (the original built it twice).
    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    opg = OutputGenerator(jobTag, pages, outputBucketName, objectName,
                          detectForms, detectTables, ddb, elasticsearchDomain)
    opg.run()

    generatePdf(jobTag, bucketName, objectName, outputBucketName)

    print("DocumentId: {}".format(jobTag))

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(jobTag)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName)

    print(output)

    return {'statusCode': 200, 'body': output}
 def setUp(self):
     """Create the documents table, seed two records, and build the store.

     Both seeded records share the same object name, bucket name and an
     IN_PROGRESS status; only their documentId differs.
     """
     self.conn = boto3.client('dynamodb', region_name=REGION)

     hashKeySchema = [{"AttributeName": "documentId", "KeyType": "HASH"}]
     keyAttributes = [{"AttributeName": "documentId", "AttributeType": "S"}]
     throughput = {"ReadCapacityUnits": 5, "WriteCapacityUnits": 5}
     self.conn.create_table(
         TableName=DOCUMENTS_TABLE_NAME,
         KeySchema=hashKeySchema,
         AttributeDefinitions=keyAttributes,
         ProvisionedThroughput=throughput,
     )

     seededIds = (
         "b1a54fda-1809-49d7-8f19-0d1688eb65b9",
         "b1a99fda-1809-49d7-8f19-0d1688eb65b9",
     )
     for seededId in seededIds:
         self.conn.put_item(
             TableName=DOCUMENTS_TABLE_NAME,
             Item={
                 "documentId": {"S": seededId},
                 "objectName": {"S": "public/samples/Misc/expense.png"},
                 "bucketName": {"S": "dusstack-sample-s3-bucket"},
                 "documentStatus": {"S": "IN_PROGRESS"},
             },
         )

     self.ds = datastore.DocumentStore(DOCUMENTS_TABLE_NAME,
                                       OUTPUT_TABLE_NAME)
예제 #11
0
def processImage(documentId, features, bucketName, outputBucketName,
                 objectName, outputTableName, documentsTableName,
                 elasticsearchDomain):
    """Process one image synchronously and index the combined results.

    Calls ``callTextract``, runs ``OutputGenerator`` against the output
    bucket/path, calls ``generatePdf``, runs
    ``ComprehendHelper.processComprehend`` on the stored response, merges the
    generator's key/value pairs into the resulting entities, indexes the
    document through the generator and marks the document complete.

    Args:
        documentId: Identifier of the document being processed.
        features: Container checked for the entries "Text", "Forms", "Tables".
        bucketName: Bucket holding the source object.
        outputBucketName: Bucket that receives generated output.
        objectName: Key of the source object.
        outputTableName: Name of the DynamoDB output table.
        documentsTableName: Name of the documents table.
        elasticsearchDomain: Passed through to ``OutputGenerator``.
    """
    wantText = "Text" in features
    wantForms = "Forms" in features
    wantTables = "Tables" in features

    textractResponse = callTextract(bucketName, objectName, wantText,
                                    wantForms, wantTables)

    resultsTable = AwsHelper().getResource("dynamodb").Table(outputTableName)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId,
                                  SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(
        documentId, outputPath))

    generator = OutputGenerator(documentId, textractResponse,
                                outputBucketName, objectName, wantForms,
                                wantTables, resultsTable, outputPath,
                                elasticsearchDomain)
    generated = generator.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName,
                outputPath)

    # Produce Comprehend and ComprehendMedical entities in S3.
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    pageLimit = 100
    storedResponseKey = "{}{}response.json".format(outputPath,
                                                   TEXTRACT_PATH_S3_PREFIX)
    entities = ComprehendHelper().processComprehend(
        outputBucketName, storedResponseKey, comprehendOutputPath, pageLimit)

    print("DocumentId: {}".format(documentId))
    print("Processed Comprehend data: {}".format(entities))

    # Merge key/value pairs into the entities: existing keys get the value via
    # .add(), unseen keys are assigned directly.
    for pairKey, pairValue in generated[KVPAIRS].items():
        if pairKey in entities:
            entities[pairKey].add(pairValue)
        else:
            entities[pairKey] = pairValue
    generator.indexDocument(generated[DOCTEXT], entities)

    store = datastore.DocumentStore(documentsTableName, outputTableName)
    store.markDocumentComplete(documentId)
def getPageResponse(request):
    """Return a document record, with the page's stored JSON response attached.

    Args:
        request: Mapping that provides "documentsTable", "outputTable",
            "documentId" and "page".

    Returns:
        The document record, or an empty dict when no document was found.
        When the record's status is "SUCCEEDED", the parsed page response is
        stored under "textractResponse".
    """
    docsTableName = request["documentsTable"]
    resultsTableName = request["outputTable"]
    docId = request["documentId"]
    pageNumber = request["page"]

    store = datastore.DocumentStore(docsTableName, resultsTableName)
    record = store.getDocument(docId)
    if not record:
        return {}

    if record["documentStatus"] == "SUCCEEDED":
        responseKey = "{}-analysis/{}/page-{}-response.json".format(
            record["objectName"], record["documentId"], pageNumber)
        rawResponse = S3Helper.readFromS3(record["bucketName"], responseKey)
        record["textractResponse"] = json.loads(rawResponse)
    return record
예제 #13
0
def processRequest(request):
    """Handle a finished job: generate output and mark the document complete.

    Args:
        request: Mapping that provides "jobId", "jobTag", "jobStatus",
            "jobAPI", "bucketName", "objectName", "outputTable" and
            "documentsTable".

    Returns:
        dict with "statusCode" 200 and a plain-text "body" summary.

    Raises:
        KeyError: If any of the keys listed above is missing from ``request``.
    """
    print(request)

    jobId = request["jobId"]
    jobTag = request["jobTag"]
    # Read so that a request without the key still fails fast; the value
    # itself is not used below.
    jobStatus = request["jobStatus"]
    jobAPI = request["jobAPI"]
    bucketName = request["bucketName"]
    objectName = request["objectName"]
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages recieved: {}".format(len(pages)))

    # Forms and tables are enabled only for the document-analysis API.
    isAnalysisJob = jobAPI == "StartDocumentAnalysis"
    detectForms = isAnalysisJob
    detectTables = isAnalysisJob

    # Build the DynamoDB table handle once (the original built it twice).
    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    opg = OutputGenerator(
        jobTag, pages, bucketName, objectName, detectForms, detectTables, ddb
    )
    opg.run()

    print("DocumentId: {}".format(jobTag))

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(jobTag)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName
    )

    print(output)

    return {"statusCode": 200, "body": output}
def getPageForm(request):
    """Return a document record, with the page's parsed forms CSV attached.

    Args:
        request: Mapping that provides "documentsTable", "outputTable",
            "documentId" and "page".

    Returns:
        The document record, or an empty dict when no document was found.
        When the record's status is "SUCCEEDED", the result of ``parsePairs``
        on the page's forms CSV is stored under "textractResponse".
    """
    docsTableName = request["documentsTable"]
    resultsTableName = request["outputTable"]
    docId = request["documentId"]
    pageNumber = request["page"]

    store = datastore.DocumentStore(docsTableName, resultsTableName)
    record = store.getDocument(docId)
    if record and record["documentStatus"] == "SUCCEEDED":
        formsKey = "{}-analysis/{}/page-{}-forms.csv".format(
            record["objectName"], record["documentId"], pageNumber)
        formsCsv = S3Helper.readFromS3(record["bucketName"], formsKey)
        record["textractResponse"] = parsePairs(formsCsv)

    emptyResult = {}
    # The empty default is printed before the final result is chosen.
    print(emptyResult)
    return record if record else emptyResult
def getPageTable(request):
    """Return the parsed tables for one page of a processed document.

    Args:
        request: Mapping that provides "documentsTable", "outputTable",
            "documentId" and "page".

    Returns:
        dict of the form {"tables": [...]}. The list is empty when the
        document does not exist, has not reached status "SUCCEEDED", or
        parsing produced nothing.
    """
    documentsTable = request["documentsTable"]
    outputTable = request["outputTable"]
    documentId = request["documentId"]
    page = request["page"]
    ds = datastore.DocumentStore(documentsTable, outputTable)
    doc = ds.getDocument(documentId)

    # Default first: previously `tables` was only bound inside the branch
    # below, so a missing or unfinished document raised UnboundLocalError.
    tables = []
    if doc and doc["documentStatus"] == "SUCCEEDED":
        fileName = "{}-analysis/{}/page-{}-tables.csv".format(
            doc["objectName"], doc["documentId"], page)
        tablesCsv = S3Helper.readFromS3(doc["bucketName"], fileName)
        tables = parseTables(getTableFromString(tablesCsv))

    output = {"tables": []}
    if tables:
        output["tables"] = tables
    return output
예제 #16
0
def getDocument(request):
    """Look up one document record through the DocumentStore.

    Args:
        request: Mapping that provides "documentsTable", "outputTable" and
            "documentId".

    Returns:
        The record returned by the store, or an empty dict when it is falsy.
    """
    print("GetDocument request: {}".format(request))

    docsTableName = request["documentsTable"]
    resultsTableName = request["outputTable"]
    docId = request["documentId"]

    store = datastore.DocumentStore(docsTableName, resultsTableName)
    record = store.getDocument(docId)

    if record:
        return record
    return {}
예제 #17
0
def processRequest(request):
    """Create a document record for one batch task and report the outcome.

    Args:
        request: Mapping that provides "bucketName", "objectName",
            "documentsTable", "outputTable", "jobId", "invocationId",
            "invocationSchemaVersion" and "taskId".

    Returns:
        dict with "invocationSchemaVersion", "treatMissingKeysAs",
        "invocationId" and a one-element "results" list. The result carries
        "resultCode" "Succeeded" when a document was created, or
        "PermanentFailure" when the file extension is not jpg, jpeg, png or
        pdf.

    Raises:
        KeyError: If any of the keys listed above is missing from ``request``.
    """
    print("request: {}".format(request))

    bucketName = request["bucketName"]
    objectName = request["objectName"]
    documentsTable = request["documentsTable"]
    outputTable = request["outputTable"]

    # Read so that a request without the key still fails fast; the value
    # itself is not used below.
    jobId = request["jobId"]
    invocationId = request["invocationId"]
    invocationSchemaVersion = request["invocationSchemaVersion"]
    taskId = request["taskId"]

    print("Input Object: {}/{}".format(bucketName, objectName))

    ext = FileHelper.getFileExtenstion(objectName.lower())
    print("Extension: {}".format(ext))

    if ext and ext in ["jpg", "jpeg", "png", "pdf"]:
        documentId = str(uuid.uuid1())
        ds = datastore.DocumentStore(documentsTable, outputTable)
        ds.createDocument(documentId, bucketName, objectName)

        output = "Saved document {} for {}/{}".format(documentId, bucketName,
                                                      objectName)
        print(output)

        resultCode = "Succeeded"
        resultString = (
            "Document submitted for processing with Id: {}".format(documentId))
    else:
        # Previously this path fell through and raised UnboundLocalError on
        # `documentId`; report the task as failed instead of crashing.
        resultCode = "PermanentFailure"
        resultString = "Unsupported file type for {}/{}".format(
            bucketName, objectName)
        print(resultString)

    results = [{
        "taskId": taskId,
        "resultCode": resultCode,
        "resultString": resultString,
    }]

    return {
        "invocationSchemaVersion": invocationSchemaVersion,
        "treatMissingKeysAs": "PermanentFailure",
        "invocationId": invocationId,
        "results": results,
    }
예제 #18
0
def processRequest(request):
    """Create a document record for one batch task and report the outcome.

    Args:
        request: Mapping that provides "bucketName", "objectName",
            "documentsTable", "outputTable", "jobId", 'invocationId',
            'invocationSchemaVersion' and 'taskId'.

    Returns:
        dict with 'invocationSchemaVersion', 'treatMissingKeysAs',
        'invocationId' and a one-element 'results' list. The result carries
        'resultCode' 'Succeeded' when a document was created, or
        'PermanentFailure' when the file extension is not jpg, jpeg, png or
        pdf.

    Raises:
        KeyError: If any of the keys listed above is missing from ``request``.
    """
    print("request: {}".format(request))

    bucketName = request["bucketName"]
    objectName = request["objectName"]
    documentsTable = request["documentsTable"]
    outputTable = request["outputTable"]

    # Read so that a request without the key still fails fast; the value
    # itself is not used below.
    jobId = request["jobId"]
    invocationId = request['invocationId']
    invocationSchemaVersion = request['invocationSchemaVersion']
    taskId = request['taskId']

    print("Input Object: {}/{}".format(bucketName, objectName))

    ext = FileHelper.getFileExtenstion(objectName.lower())
    print("Extension: {}".format(ext))

    if ext and ext in ["jpg", "jpeg", "png", "pdf"]:
        documentId = str(uuid.uuid1())
        ds = datastore.DocumentStore(documentsTable, outputTable)
        ds.createDocument(documentId, bucketName, objectName)

        output = "Saved document {} for {}/{}".format(documentId, bucketName,
                                                      objectName)
        print(output)

        resultCode = 'Succeeded'
        resultString = (
            "Document submitted for processing with Id: {}".format(documentId))
    else:
        # Previously this path fell through and raised UnboundLocalError on
        # `documentId`; report the task as failed instead of crashing.
        resultCode = 'PermanentFailure'
        resultString = "Unsupported file type for {}/{}".format(
            bucketName, objectName)
        print(resultString)

    results = [{
        'taskId': taskId,
        'resultCode': resultCode,
        'resultString': resultString
    }]

    return {
        'invocationSchemaVersion': invocationSchemaVersion,
        'treatMissingKeysAs': 'PermanentFailure',
        'invocationId': invocationId,
        'results': results
    }
예제 #19
0
def createDocument(request):
    """Create a document record whose id comes from the object key.

    The second '/'-separated segment of the object name is used as the
    document id, unless that segment is 'samples', in which case
    ``generateDocumentID`` supplies the id.

    Args:
        request: Mapping that provides "documentsTable", "outputTable",
            "bucketName" and "objectName".

    Returns:
        dict of the form {"documentId": <id>}.
    """
    print("CreateDocument request: {}".format(request))

    docsTableName = request["documentsTable"]
    resultsTableName = request["outputTable"]
    sourceBucket = request["bucketName"]
    sourceKey = request["objectName"]
    store = datastore.DocumentStore(docsTableName, resultsTableName)

    rootPrefix = sourceKey.split('/')[1]
    # For the bundled sample files the backend has to generate the id itself.
    chosenId = (generateDocumentID(sourceBucket)
                if rootPrefix == 'samples' else rootPrefix)

    store.createDocument(chosenId, sourceBucket, sourceKey)
    return {"documentId": chosenId}
def processImage(documentId, features, bucketName, outputBucketName,
                 objectName, outputTableName, documentsTableName,
                 elasticsearchDomain):
    """Process one image synchronously and index the combined results.

    Calls ``callTextract``, runs ``OutputGenerator`` against the output
    bucket, calls ``generatePdf``, runs
    ``ComprehendHelper.processComprehend`` under the
    "<objectName>-analysis/<documentId>/" path, merges the generator's
    key/value pairs into the resulting entities, indexes the document through
    the generator and marks the document complete.

    Args:
        documentId: Identifier of the document being processed.
        features: Container checked for the entries "Text", "Forms", "Tables".
        bucketName: Bucket holding the source object.
        outputBucketName: Bucket that receives generated output.
        objectName: Key of the source object.
        outputTableName: Name of the DynamoDB output table.
        documentsTableName: Name of the documents table.
        elasticsearchDomain: Passed through to ``OutputGenerator``.
    """
    wantText = "Text" in features
    wantForms = "Forms" in features
    wantTables = "Tables" in features

    textractResponse = callTextract(bucketName, objectName, wantText,
                                    wantForms, wantTables)

    resultsTable = AwsHelper().getResource("dynamodb").Table(outputTableName)

    print("Generating output for DocumentId: {}".format(documentId))

    generator = OutputGenerator(documentId, textractResponse,
                                outputBucketName, objectName, wantForms,
                                wantTables, resultsTable, elasticsearchDomain)
    generated = generator.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName)

    # Produce Comprehend and ComprehendMedical entities in S3.
    analysisPath = objectName + "-analysis" + "/" + documentId + "/"
    print("path: " + analysisPath)
    pageLimit = 100
    entities = ComprehendHelper().processComprehend(
        outputBucketName, 'response.json', analysisPath, pageLimit)

    print("DocumentId: {}".format(documentId))
    print("Processed Comprehend data: {}".format(entities))

    # Merge key/value pairs into the entities: existing keys get the value via
    # .add(), unseen keys are assigned directly.
    for pairKey, pairValue in generated[KVPAIRS].items():
        if pairKey in entities:
            entities[pairKey].add(pairValue)
        else:
            entities[pairKey] = pairValue
    generator.indexDocument(generated[DOCTEXT], entities)

    store = datastore.DocumentStore(documentsTableName, outputTableName)
    store.markDocumentComplete(documentId)
예제 #21
0
def processRequest(request):
    """Handle a finished job: generate output, run Comprehend, index, complete.

    Args:
        request: Mapping that provides 'jobId', 'jobTag', 'jobStatus',
            'jobAPI', 'bucketName', 'outputBucketName', 'objectName',
            "outputTable", "documentsTable" and "elasticsearchDomain".

    Returns:
        dict with 'statusCode' 200 and a plain-text 'body' summary.

    Raises:
        KeyError: If any of the keys listed above is missing from ``request``.
    """
    print(request)

    jobId = request['jobId']
    jobTag = request['jobTag']
    # Read so that a request without the key still fails fast; the value
    # itself is not used below.
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    outputBucketName = request['outputBucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]
    elasticsearchDomain = request["elasticsearchDomain"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages recieved: {}".format(len(pages)))

    # Forms and tables are enabled only for the document-analysis API.
    isAnalysisJob = jobAPI == "StartDocumentAnalysis"
    detectForms = isAnalysisJob
    detectTables = isAnalysisJob

    # Build the DynamoDB table handle once (the original built it twice).
    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    opg = OutputGenerator(jobTag, pages, outputBucketName, objectName,
                          detectForms, detectTables, ddb, elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(jobTag, bucketName, objectName, outputBucketName)

    # generate Comprehend and ComprehendMedical entities
    path = objectName + "-analysis" + "/" + jobTag + "/"
    print("path: " + path)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, 'response.json', path, maxPages)

    print("DocumentId: {}".format(jobTag))

    # index document once the comprehend entities and KVPairs have been
    # extracted: unseen keys are assigned, existing keys receive the value
    # through .add().
    # NOTE(review): this assumes existing entries expose .add() (e.g. sets) -
    # confirm against ComprehendHelper.processComprehend.
    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(jobTag)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName)

    return {'statusCode': 200, 'body': output}
예제 #22
0
def processRequest(request):
    """Handle a finished job and publish every derived output.

    Steps, in order: fetch the job results, run ``OutputGenerator`` against
    the output bucket/path, call ``generatePdf``, run
    ``ComprehendHelper.processComprehend`` on the stored response, index the
    searchable PDF with ``KendraHelper`` when the environment variable
    KENDRA_INDEX_ID is set, merge the generator's key/value pairs into the
    Comprehend entities, index the document through the generator, and mark
    the document complete.

    Args:
        request: Mapping that provides 'jobId', 'jobTag' (used as the document
            id), 'jobStatus', 'jobAPI', 'bucketName', 'outputBucketName',
            'objectName', "outputTable", "documentsTable" and
            "elasticsearchDomain".

    Returns:
        dict with 'statusCode' 200 and a plain-text 'body' summary.

    Raises:
        KeyError: If any of the keys listed above is missing from ``request``.
    """
    print("Request : {}".format(request))

    jobId = request['jobId']
    documentId = request['jobTag']
    # Read so that a request without the key still fails fast; the value
    # itself is not used below.
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    outputBucketName = request['outputBucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]
    elasticsearchDomain = request["elasticsearchDomain"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages recieved: {}".format(len(pages)))

    # Forms and tables are enabled only for the document-analysis API.
    isAnalysisJob = jobAPI == "StartDocumentAnalysis"
    detectForms = isAnalysisJob
    detectTables = isAnalysisJob

    # Build the DynamoDB table handle once (the original built it twice).
    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId,
                                  SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(
        documentId, outputPath))

    opg = OutputGenerator(documentId, pages, outputBucketName, objectName,
                          detectForms, detectTables, ddb, outputPath,
                          elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName,
                outputPath)

    # generate Comprehend and ComprehendMedical entities
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    responseDocumentName = "{}{}response.json".format(outputPath,
                                                      TEXTRACT_PATH_S3_PREFIX)
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, maxPages)

    # if Kendra is available then let it index the document
    if 'KENDRA_INDEX_ID' in os.environ:
        kendraClient = KendraHelper()
        baseName = os.path.basename(objectName)
        # The stem stays "everything before the first dot".
        # NOTE(review): presumably this must match the name generatePdf
        # gives the searchable PDF - confirm before changing it.
        fileName = baseName.split(".")[0]
        # The extension is the text after the LAST dot, so "scan.v2.pdf"
        # yields "pdf" (not "v2"), and a name without a dot yields "" instead
        # of raising IndexError.
        fileExtension = baseName.rpartition(".")[2] if "." in baseName else ""
        outputDocumentName = "{}{}-searchable.pdf".format(outputPath, fileName)
        # NOTE(review): the source bucket is passed here although generatePdf
        # is given outputBucketName - confirm which bucket holds the PDF.
        kendraClient.indexDocument(os.environ['KENDRA_INDEX_ID'],
                                   os.environ['KENDRA_ROLE_ARN'], bucketName,
                                   outputDocumentName, documentId,
                                   fileExtension)

    print("DocumentId: {}".format(documentId))
    print("Processed Comprehend data: {}".format(comprehendAndMedicalEntities))

    # index document once the comprehend entities and KVPairs have been
    # extracted: unseen keys are assigned, existing keys receive the value
    # through .add().
    # NOTE(review): this assumes existing entries expose .add() (e.g. sets) -
    # confirm against ComprehendHelper.processComprehend.
    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(documentId)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        documentId, bucketName, objectName)

    return {'statusCode': 200, 'body': output}