def processRequest(request):
    """Generate the output for a finished job and mark its document complete.

    Fetches the job's result pages with ``getJobResults``, runs
    ``OutputGenerator`` over them, calls ``generatePdf`` and finally flags the
    document (identified by ``jobTag``) as complete in the document store.

    Args:
        request (dict): Must provide the keys ``jobId``, ``jobTag``,
            ``jobStatus``, ``jobAPI``, ``bucketName``, ``outputBucketName``,
            ``objectName``, ``outputTable``, ``documentsTable`` and
            ``elasticsearchDomain``.

    Returns:
        dict: ``{'statusCode': 200, 'body': <summary message>}``.

    Raises:
        KeyError: If ``request`` lacks one of the keys listed above.
    """
    print(request)

    jobId = request['jobId']
    jobTag = request['jobTag']
    # Not used below; still read so that a request without this key keeps
    # failing with KeyError as it always has.
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    outputBucketName = request['outputBucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]
    elasticsearchDomain = request["elasticsearchDomain"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages recieved: {}".format(len(pages)))

    # The resource/table pair used to be built twice back to back; once is
    # enough because nothing in between depends on it.
    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    # Forms and tables are only requested for the document-analysis API.
    isAnalysis = jobAPI == "StartDocumentAnalysis"
    detectForms = isAnalysis
    detectTables = isAnalysis

    opg = OutputGenerator(jobTag, pages, outputBucketName, objectName,
                          detectForms, detectTables, ddb, elasticsearchDomain)
    opg.run()

    generatePdf(jobTag, bucketName, objectName, outputBucketName)

    print("DocumentId: {}".format(jobTag))

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(jobTag)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName)

    print(output)

    return {'statusCode': 200, 'body': output}
def processRequest(request):
    """Generate the output for a finished job and mark its document complete.

    Fetches the job's result pages with ``getJobResults``, runs
    ``OutputGenerator`` over them and then flags the document (identified by
    ``jobTag``) as complete in the document store.

    Args:
        request (dict): Must provide the keys ``jobId``, ``jobTag``,
            ``jobStatus``, ``jobAPI``, ``bucketName``, ``objectName``,
            ``outputTable`` and ``documentsTable``.

    Returns:
        dict: ``{"statusCode": 200, "body": <summary message>}``.

    Raises:
        KeyError: If ``request`` lacks one of the keys listed above.
    """
    print(request)

    jobId = request["jobId"]
    jobTag = request["jobTag"]
    # Not used below; still read so that a request without this key keeps
    # failing with KeyError as it always has.
    jobStatus = request["jobStatus"]
    jobAPI = request["jobAPI"]
    bucketName = request["bucketName"]
    objectName = request["objectName"]
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages recieved: {}".format(len(pages)))

    # The resource/table pair used to be built twice back to back; once is
    # enough because nothing in between depends on it.
    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)

    # Forms and tables are only requested for the document-analysis API.
    isAnalysis = jobAPI == "StartDocumentAnalysis"
    detectForms = isAnalysis
    detectTables = isAnalysis

    opg = OutputGenerator(
        jobTag, pages, bucketName, objectName, detectForms, detectTables, ddb
    )
    opg.run()

    print("DocumentId: {}".format(jobTag))

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(jobTag)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName
    )

    print(output)

    return {"statusCode": 200, "body": output}
Exemplo n.º 3
0
    def createDocument(self, documentId, bucketName, objectName):
        """Create the record for ``documentId`` unless one already exists.

        The record is written with ``documentStatus`` set to ``IN_PROGRESS``
        and ``documentCreatedOn`` set to the current UTC time as a string.

        Args:
            documentId: Value of the ``documentId`` key of the record.
            bucketName: Stored in the record's ``bucketName`` attribute.
            objectName: Stored in the record's ``objectName`` attribute.

        Returns:
            None on success, or ``{'Error': 'Document already exist.'}`` when
            the conditional write is rejected.

        Raises:
            ClientError: Any failure other than the conditional check.
        """
        documentsTable = AwsHelper().getResource("dynamodb").Table(
            self._documentsTableName)

        attributeValues = {
            ':bucketNameValue': bucketName,
            ':objectNameValue': objectName,
            ':documentstatusValue': 'IN_PROGRESS',
            ':documentCreatedOnValue': str(datetime.datetime.utcnow()),
        }

        try:
            documentsTable.update_item(
                Key={"documentId": documentId},
                UpdateExpression=(
                    'SET bucketName = :bucketNameValue, '
                    'objectName = :objectNameValue, '
                    'documentStatus = :documentstatusValue, '
                    'documentCreatedOn = :documentCreatedOnValue'
                ),
                # Only succeeds when no item with this key is present yet.
                ConditionExpression='attribute_not_exists(documentId)',
                ExpressionAttributeValues=attributeValues,
            )
        except ClientError as exc:
            print(exc)
            if exc.response['Error']['Code'] != "ConditionalCheckFailedException":
                raise
            print(exc.response['Error']['Message'])
            return {'Error': 'Document already exist.'}

        return None
Exemplo n.º 4
0
    def queryByIndexBothKeys(self, indexPartitionKey, indexSortKey):
        """List the data from database based on index partition key and index sort key

        Queries the index named by ``self._indexName`` and follows
        ``LastEvaluatedKey`` so that every result page is collected, not just
        the first one.

        Args:
           indexPartitionKey(str): partition key value of index
           indexSortKey(str): sort key value of index

        Returns:
            List of matching items. Empty when the configured database is not
            'dynamodb' or when the query fails (the error is printed).
        """
        if self._databaseName != 'dynamodb':
            return []

        dynamodb = AwsHelper().getResource(self._databaseName,
                                           self._awsRegion)
        table = dynamodb.Table(self._tableName)

        items = []
        try:
            queryArgs = {
                'IndexName': self._indexName,
                'KeyConditionExpression':
                    Key(self._indexPartitionKeyName).eq(indexPartitionKey)
                    & Key(self._indexSortKeyName).eq(indexSortKey),
            }
            while True:
                response = table.query(**queryArgs)
                items.extend(response.get('Items', []))
                lastKey = response.get('LastEvaluatedKey')
                if not lastKey:
                    break
                # Resume the query right after the last item of this page.
                queryArgs['ExclusiveStartKey'] = lastKey
        except ParamValidationError as e:
            print("Parameter validation error: %s" % e)
            return []
        except ClientError as e:
            print("Unexpected error: %s" % e)
            return []
        return items
    def getDocumentCount(self):
        """Return the ``item_count`` attribute of the documents table."""
        # NOTE(review): DynamoDB documents this figure as approximate --
        # confirm callers do not need an exact count.
        documentsTable = AwsHelper().getResource("dynamodb").Table(
            self._documentsTableName)
        return documentsTable.item_count
Exemplo n.º 6
0
    def save(self, info):
        """Store the data into database

        Only the 'dynamodb' backend is handled; for any other configured
        database nothing is written and ``{'status': 'OK'}`` is returned.

        Args:
            info(dict): information to store; every value must be truthy

        Returns:
            dict: ``{'status': 'OK'}`` on success, or
            ``{'status': 'BAD', 'error': <message>}`` when a value is empty
            or the write itself fails.
        """
        response = {'status': 'OK'}
        if self._databaseName != 'dynamodb':
            return response

        # Validate first so a bad payload never costs an AWS round trip.
        for key in info:
            if not info[key]:
                response['status'] = 'BAD'
                response['error'] = key + ' should not be empty.'
                print(response['error'])
                return response

        dynamodb = AwsHelper().getResource(self._databaseName,
                                           self._awsRegion)
        table = dynamodb.Table(self._tableName)
        try:
            table.put_item(Item=info)
        except ParamValidationError as e:
            # A failed write must not be reported as 'OK'.
            response['status'] = 'BAD'
            response['error'] = "Parameter validation error: %s" % e
            print(response['error'])
        except ClientError as e:
            response['status'] = 'BAD'
            response['error'] = "Unexpected error: %s" % e
            print(response['error'])
        return response
Exemplo n.º 7
0
def processImage(
    documentId, features, bucketName, objectName, outputTableName, documentsTableName
):
    """Analyse one object, generate its output and mark the document complete.

    Args:
        documentId: Identifier passed to the output generator and the store.
        features: Container tested for "Text", "Forms" and "Tables".
        bucketName: Bucket handed to ``callTextract`` and the output generator.
        objectName: Object handed to ``callTextract`` and the output generator.
        outputTableName: Name of the DynamoDB table given to the generator.
        documentsTableName: Name of the documents table used by the store.

    Returns:
        None.
    """
    wantsText = "Text" in features
    wantsForms = "Forms" in features
    wantsTables = "Tables" in features

    textractResponse = callTextract(
        bucketName, objectName, wantsText, wantsForms, wantsTables
    )

    outputTable = AwsHelper().getResource("dynamodb").Table(outputTableName)

    print("Generating output for DocumentId: {}".format(documentId))

    OutputGenerator(
        documentId,
        textractResponse,
        bucketName,
        objectName,
        wantsForms,
        wantsTables,
        outputTable,
    ).run()

    print("DocumentId: {}".format(documentId))

    store = datastore.DocumentStore(documentsTableName, outputTableName)
    store.markDocumentComplete(documentId)
Exemplo n.º 8
0
    def createDocument(self, documentId, bucketName, objectName):
        """Create the record for ``documentId`` unless one already exists.

        The record gets ``documentStatus`` "IN_PROGRESS" and a
        ``documentCreatedOn`` string taken from the current UTC time.

        Args:
            documentId: Value of the ``documentId`` key of the record.
            bucketName: Stored in the record's ``bucketName`` attribute.
            objectName: Stored in the record's ``objectName`` attribute.

        Returns:
            None on success, or ``{"Error": "Document already exist."}`` when
            the conditional write is rejected.

        Raises:
            ClientError: Any failure other than the conditional check.
        """
        table = AwsHelper().getResource("dynamodb").Table(
            self._documentsTableName)

        updateExpression = (
            "SET bucketName = :bucketNameValue, "
            "objectName = :objectNameValue, "
            "documentStatus = :documentstatusValue, "
            "documentCreatedOn = :documentCreatedOnValue"
        )
        values = {
            ":bucketNameValue": bucketName,
            ":objectNameValue": objectName,
            ":documentstatusValue": "IN_PROGRESS",
            ":documentCreatedOnValue": str(datetime.datetime.utcnow()),
        }

        try:
            table.update_item(
                Key={"documentId": documentId},
                UpdateExpression=updateExpression,
                # Only succeeds when no item with this key is present yet.
                ConditionExpression="attribute_not_exists(documentId)",
                ExpressionAttributeValues=values,
            )
        except ClientError as exc:
            print(exc)
            alreadyThere = (
                exc.response["Error"]["Code"]
                == "ConditionalCheckFailedException"
            )
            if not alreadyThere:
                raise
            print(exc.response["Error"]["Message"])
            return {"Error": "Document already exist."}

        return None
Exemplo n.º 9
0
    def markDocumentComplete(self, documentId):
        """Set an existing document's status to "SUCCEEDED".

        Also stores the current UTC time, as a string, in
        ``documentCompletedOn``.

        Args:
            documentId: Value of the ``documentId`` key of the record.

        Returns:
            None on success, or ``{"Error": "Document does not exist."}`` when
            no record with that key exists.

        Raises:
            ClientError: Any failure other than the conditional check.
        """
        table = AwsHelper().getResource("dynamodb").Table(
            self._documentsTableName)

        values = {
            ":documentstatusValue": "SUCCEEDED",
            ":documentCompletedOnValue": str(datetime.datetime.utcnow()),
        }

        try:
            table.update_item(
                Key={"documentId": documentId},
                UpdateExpression=(
                    "SET documentStatus= :documentstatusValue, "
                    "documentCompletedOn = :documentCompletedOnValue"
                ),
                # Refuse to create a record implicitly.
                ConditionExpression="attribute_exists(documentId)",
                ExpressionAttributeValues=values,
            )
        except ClientError as exc:
            failure = exc.response["Error"]
            if failure["Code"] != "ConditionalCheckFailedException":
                raise
            print(exc.response["Error"]["Message"])
            return {"Error": "Document does not exist."}

        return None
def processImage(documentId, features, bucketName, outputBucketName,
                 objectName, outputTableName, documentsTableName,
                 elasticsearchDomain):
    """Process one object end to end and mark its document complete.

    In order: calls ``callTextract``, runs ``OutputGenerator``, calls
    ``generatePdf``, runs ``ComprehendHelper.processComprehend`` on the
    generated ``response.json``, optionally indexes the searchable PDF through
    ``KendraHelper`` (only when the ``KENDRA_INDEX_ID`` environment variable
    is set), merges the generator's key/value pairs into the Comprehend
    entities, indexes the document text and finally marks the document
    complete.

    Args:
        documentId: Identifier used for output paths, indexing and the store.
        features: Container tested for "Text", "Forms" and "Tables".
        bucketName: Bucket of the source object.
        outputBucketName: Bucket that receives the generated output.
        objectName: Key of the source object.
        outputTableName: Name of the DynamoDB table given to the generator.
        documentsTableName: Name of the documents table used by the store.
        elasticsearchDomain: Passed through to ``OutputGenerator``.

    Returns:
        None.

    Raises:
        KeyError: If ``KENDRA_INDEX_ID`` is set but ``KENDRA_ROLE_ARN`` is not.
    """
    detectText = "Text" in features
    detectForms = "Forms" in features
    detectTables = "Tables" in features

    response = callTextract(bucketName, objectName, detectText, detectForms,
                            detectTables)

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTableName)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId,
                                  SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(
        documentId, outputPath))

    opg = OutputGenerator(documentId, response, outputBucketName, objectName,
                          detectForms, detectTables, ddb, outputPath,
                          elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName,
                outputPath)

    # generate Comprehend and ComprehendMedical entities in S3
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    responseDocumentName = "{}{}response.json".format(outputPath,
                                                      TEXTRACT_PATH_S3_PREFIX)
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, maxPages)

    # if Kendra is available then let it index the document
    # index the searchable pdf in Kendra
    if 'KENDRA_INDEX_ID' in os.environ:
        kendraClient = KendraHelper()
        baseName = os.path.basename(objectName)
        # The stem is deliberately still "everything before the first dot",
        # so the key built below is the same one this code always produced.
        fileName = baseName.split(".")[0]
        # The extension is what follows the LAST dot. Indexing split(".")[1]
        # raised IndexError for names without a dot and returned the wrong
        # segment for names such as "scan.v2.pdf".
        _, dot, suffix = baseName.rpartition(".")
        fileExtension = suffix if dot else ""
        outputDocumentName = "{}{}-searchable.pdf".format(outputPath, fileName)
        kendraClient.indexDocument(os.environ['KENDRA_INDEX_ID'],
                                   os.environ['KENDRA_ROLE_ARN'],
                                   outputBucketName, outputDocumentName,
                                   documentId, fileExtension)

    print("Processed Comprehend data for document: {}".format(documentId))

    # Fold the key/value pairs found by the generator into the entity map.
    # NOTE(review): the else-branch assumes existing values support .add()
    # (e.g. sets) -- confirm against processComprehend.
    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTableName, outputTableName)
    ds.markDocumentComplete(documentId)
Exemplo n.º 11
0
    def getDocuments(self, nextToken=None):
        """Return one page (up to 25 items) of a scan over the ops table.

        Args:
            nextToken: ``documentId`` to resume after, as returned in a
                previous page's ``nextToken``; falsy to start from the top.

        Returns:
            dict: ``{"documents": [...]}`` plus a ``"nextToken"`` entry when
            the scan reports a ``LastEvaluatedKey``.
        """
        opsTable = AwsHelper().getResource("dynamodb").Table(
            self._opsTableName)

        scanArgs = {"Limit": 25}
        if nextToken:
            scanArgs["ExclusiveStartKey"] = {"documentId": nextToken}
        response = opsTable.scan(**scanArgs)

        print("response: {}".format(response))

        documents = {"documents": response.get('Items', [])}

        if 'LastEvaluatedKey' in response:
            nextToken = response['LastEvaluatedKey']['documentId']
            print("nexToken: {}".format(nextToken))
            documents["nextToken"] = nextToken

        return documents
Exemplo n.º 12
0
    def markDocumentComplete(self, documentId):
        """Set an existing document's status to "SUCCEEDED".

        Also stores the current UTC time, as a string, in
        ``documentCompletedOn``.

        Args:
            documentId: Value of the ``documentId`` key of the record.

        Returns:
            None on success, or ``{'Error': 'Document does not exist.'}`` when
            no record with that key exists.

        Raises:
            ClientError: Any failure other than the conditional check.
        """
        documentsTable = AwsHelper().getResource("dynamodb").Table(
            self._documentsTableName)

        completion = {
            ':documentstatusValue': "SUCCEEDED",
            ':documentCompletedOnValue': str(datetime.datetime.utcnow()),
        }

        try:
            documentsTable.update_item(
                Key={'documentId': documentId},
                UpdateExpression=(
                    'SET documentStatus= :documentstatusValue, '
                    'documentCompletedOn = :documentCompletedOnValue'
                ),
                # Refuse to create a record implicitly.
                ConditionExpression='attribute_exists(documentId)',
                ExpressionAttributeValues=completion,
            )
        except ClientError as exc:
            if exc.response['Error']['Code'] != "ConditionalCheckFailedException":
                raise
            print(exc.response['Error']['Message'])
            return {'Error': 'Document does not exist.'}

        return None
Exemplo n.º 13
0
    def updateDocumentStatus(self, documentId, documentStatus, jobId=None):
        """Overwrite ``documentStatus`` and ``jobId`` on an existing document.

        Args:
            documentId: Value of the ``documentId`` key of the record.
            documentStatus: New value for the ``documentStatus`` attribute.
            jobId: New value for the ``jobId`` attribute (written even when
                it is None).

        Returns:
            None on success, or ``{'Error': 'Document does not exist.'}`` when
            no record with that key exists.

        Raises:
            ClientError: Any failure other than the conditional check.
        """
        documentsTable = AwsHelper().getResource("dynamodb").Table(
            self._documentsTableName)

        newValues = {
            ':documentstatusValue': documentStatus,
            ':jobIdValue': jobId,
        }

        try:
            documentsTable.update_item(
                Key={'documentId': documentId},
                UpdateExpression=(
                    'SET documentStatus= :documentstatusValue, '
                    'jobId= :jobIdValue'
                ),
                # Refuse to create a record implicitly.
                ConditionExpression='attribute_exists(documentId)',
                ExpressionAttributeValues=newValues,
            )
        except ClientError as exc:
            if exc.response['Error']['Code'] != "ConditionalCheckFailedException":
                raise
            print(exc.response['Error']['Message'])
            return {'Error': 'Document does not exist.'}

        return None
Exemplo n.º 14
0
    def deleteDocument(self, documentId):
        """Delete the ops-table item whose ``documentId`` key is ``documentId``."""
        opsTable = AwsHelper().getResource("dynamodb").Table(
            self._opsTableName)
        opsTable.delete_item(Key={'documentId': documentId})
    def deleteItem(self, itemId):
        """Delete the items-table entry whose ``itemId`` key is ``itemId``."""
        itemsTable = AwsHelper().getResource("dynamodb").Table(
            self._itemsTableName)
        itemsTable.delete_item(Key={'itemId': itemId})
Exemplo n.º 16
0
def processRequest(request):
    """Generate output for a finished job, mark it complete and enqueue it.

    Fetches the job's result pages, runs ``OutputGenerator`` over them, marks
    the document complete and posts a message describing the document to the
    SQS queue named by ``request["elasticQueueUrl"]``.

    Args:
        request (dict): Must provide the keys ``jobId``, ``jobTag``,
            ``jobStatus``, ``jobAPI``, ``bucketName``, ``objectName``,
            ``outputTable``, ``outputBucket``, ``documentsTable`` and
            ``elasticQueueUrl``.

    Returns:
        dict: ``{'statusCode': 200, 'body': <summary message>}``.

    Raises:
        KeyError: If ``request`` lacks one of the keys listed above.
    """
    print(request)

    jobId = request['jobId']
    documentId = request['jobTag']
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    objectName = request['objectName']
    outputTableName = request["outputTable"]
    outputBucket = request["outputBucket"]
    documentsTableName = request["documentsTable"]
    queueUrl = request["elasticQueueUrl"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages recieved: {}".format(len(pages)))

    outputTable = AwsHelper().getResource("dynamodb").Table(outputTableName)

    # Forms and tables are only requested for the document-analysis API.
    isAnalysis = jobAPI == "StartDocumentAnalysis"
    detectForms = True if isAnalysis else False
    detectTables = True if isAnalysis else False

    generator = OutputGenerator(jobId, documentId, pages, outputBucket,
                                objectName, detectForms, detectTables,
                                outputTable)
    generator.run()

    print("DocumentId: {}".format(documentId))

    store = datastore.DocumentStore(documentsTableName, outputTableName)
    store.markDocumentComplete(documentId, jobId)

    queueMessage = dict(
        documentId=documentId,
        jobId=jobId,
        bucketName=outputBucket,
        objectName=objectName,
    )

    sqsClient = AwsHelper().getClient('sqs')
    postMessage(sqsClient, queueUrl, queueMessage)

    summary = "Processed -> Document: {}, Object: {}/{} processed.".format(
        documentId, bucketName, objectName)

    print(summary)

    return {'statusCode': 200, 'body': summary}
Exemplo n.º 17
0
    def updateDocumentStatus(self, documentId, status, stage, timestamp, message=None):
        """Record a new status/stage for a document and extend its timeline.

        Sets ``documentStatus``, ``documentStage`` and ``lastUpdate`` on the
        ops-table item and appends one datapoint to its ``timeline`` list.

        Args:
            documentId: Value of the ``documentId`` key of the item.
            status: New ``documentStatus``; also stored in the datapoint.
            stage: New ``documentStage``; also stored in the datapoint.
            timestamp: New ``lastUpdate``; also stored in the datapoint.
            message: Optional text added to the datapoint when truthy.

        Returns:
            dict: ``{'Status': 200}`` on success, otherwise a dict with
            ``'Error'`` and ``'Status'`` entries. Exceptions raised by the
            update itself are reported this way rather than propagated.
        """
        opsTable = AwsHelper().getResource("dynamodb").Table(
            self._opsTableName)

        datapoint = {
            "timestamp": timestamp,
            "stage": stage,
            "status": status,
        }
        if message:
            datapoint["message"] = message

        try:
            opsTable.update_item(
                Key={'documentId': documentId},
                UpdateExpression=(
                    'SET documentStatus = :documentStatus, '
                    'documentStage = :documentStage, '
                    'lastUpdate = :lastUpdate, '
                    'timeline = list_append(timeline, :new_datapoint)'
                ),
                ConditionExpression='attribute_exists(documentId)',
                ExpressionAttributeValues={
                    ':documentStatus': status,
                    ':documentStage': stage,
                    ':lastUpdate': timestamp,
                    # list_append joins two lists, hence the wrapping list.
                    ':new_datapoint': [datapoint],
                },
            )
        except ClientError as exc:
            print(exc)
            return {
                'Error': exc.response['Error']['Message'],
                'Status': exc.response['ResponseMetadata']['HTTPStatusCode'],
            }
        except Exception as exc:
            print(exc)
            return {'Error': 'Updating document failed', 'Status': 400}

        return {'Status': 200}
Exemplo n.º 18
0
def processImage(documentId, features, bucketName, outputBucketName,
                 objectName, outputTableName, documentsTableName,
                 elasticsearchDomain):
    """Process one object end to end and mark its document complete.

    In order: calls ``callTextract``, runs ``OutputGenerator``, calls
    ``generatePdf``, runs ``ComprehendHelper.processComprehend`` on the
    generated ``response.json``, merges the generator's key/value pairs into
    the Comprehend entities, indexes the document text and finally marks the
    document complete.

    Args:
        documentId: Identifier used for output paths, indexing and the store.
        features: Container tested for "Text", "Forms" and "Tables".
        bucketName: Bucket of the source object.
        outputBucketName: Bucket that receives the generated output.
        objectName: Key of the source object.
        outputTableName: Name of the DynamoDB table given to the generator.
        documentsTableName: Name of the documents table used by the store.
        elasticsearchDomain: Passed through to ``OutputGenerator``.

    Returns:
        None.
    """
    wantsText = "Text" in features
    wantsForms = "Forms" in features
    wantsTables = "Tables" in features

    textractResponse = callTextract(bucketName, objectName, wantsText,
                                    wantsForms, wantsTables)

    outputTable = AwsHelper().getResource("dynamodb").Table(outputTableName)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId,
                                  SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(
        documentId, outputPath))

    generator = OutputGenerator(documentId, textractResponse,
                                outputBucketName, objectName, wantsForms,
                                wantsTables, outputTable, outputPath,
                                elasticsearchDomain)
    generated = generator.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName,
                outputPath)

    # Comprehend / ComprehendMedical entities are written below this prefix.
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: {}".format(comprehendOutputPath))
    comprehendClient = ComprehendHelper()
    responseDocumentName = "{}{}response.json".format(
        outputPath, TEXTRACT_PATH_S3_PREFIX)
    entities = comprehendClient.processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, 100)

    print("DocumentId: {}".format(documentId))
    print("Processed Comprehend data: {}".format(entities))

    # Fold the key/value pairs found by the generator into the entity map.
    for name, value in generated[KVPAIRS].items():
        if name in entities:
            entities[name].add(value)
        else:
            entities[name] = value
    generator.indexDocument(generated[DOCTEXT], entities)

    store = datastore.DocumentStore(documentsTableName, outputTableName)
    store.markDocumentComplete(documentId)
Exemplo n.º 19
0
 def queryDocumentId(self, targetBucketName, targetFileName, versionId=None):
     """Look up the documentId recorded for a bucket/file (and version).

     Builds the signature ``BUCKET:<bucket>@FILE:<file>`` (plus
     ``@VERSION:<version>`` when ``versionId`` is truthy) and queries the
     lineage index for it. When several items match, the one with the
     earliest ``timestamp`` is used.

     Args:
         targetBucketName: Bucket part of the document signature.
         targetFileName: File part of the document signature.
         versionId: Optional version part of the document signature.

     Returns:
         dict: ``{'Status': 200, 'documentId': <id>}`` when found,
         ``{'Status': 404, 'documentId': None}`` when nothing matches, or a
         dict with ``'Error'`` and ``'Status'`` entries on failure.
     """
     dynamodb = AwsHelper().getResource("dynamodb")
     table = dynamodb.Table(self._lineageTableName)
     documentSignature = "BUCKET:{}@FILE:{}".format(targetBucketName, targetFileName)
     if versionId:
         documentSignature += "@VERSION:{}".format(versionId)
     try:
         res = table.query(
             KeyConditionExpression = Key('documentSignature').eq(documentSignature),
             IndexName = self._lineageIndexName
         )
     except ClientError as e:
         print(e)
         # Return right away: falling through used to overwrite this error
         # (and its HTTP status) with the generic 400 from the block below.
         return {
             'Error': e.response['Error']['Message'],
             'Status': e.response['ResponseMetadata']['HTTPStatusCode']
         }
     except Exception as e:
         print(e)
         return {
             'Error': 'Unknown error occurred during querying the document Id',
             'Status': 400
         }
     try:
         items = res['Items']
         print(items)
         if len(items) == 0:
             ret = {
                 'Status': 404,
                 'documentId': None
             }
         else:
             # Oldest entry first, same ordering as before.
             items.sort(key=lambda item: datetime.fromisoformat(item['timestamp']))
             ret = {
                 'Status': 200,
                 'documentId': items[0]['documentId']
             }
     except Exception as e:
         print(e)
         ret = {
             'Error': 'Could not find the documentId for specified document Signature',
             'Status': 400
         }

     return ret
Exemplo n.º 20
0
 def createLineage(self, documentId, callerId, targetBucketName, targetFileName, timestamp, s3Event, sourceBucketName=None, sourceFileName=None, versionId=None):
     """Write one lineage item for a document.

     The item's ``documentSignature`` is ``BUCKET:<bucket>@FILE:<file>`` with
     ``@VERSION:<version>`` appended when ``versionId`` is truthy. The
     optional ``versionId``, ``sourceFileName`` and ``sourceBucketName``
     attributes are stored only when truthy.

     Returns:
         dict: ``{'Status': 200}`` on success, otherwise a dict with
         ``'Error'`` and ``'Status'`` entries. Exceptions raised by the write
         are reported this way rather than propagated.
     """
     lineageTable = AwsHelper().getResource("dynamodb").Table(self._lineageTableName)

     signature = "BUCKET:{}@FILE:{}".format(targetBucketName, targetFileName)
     if versionId:
         signature += "@VERSION:{}".format(versionId)

     record = {
         "documentId": documentId,
         "documentSignature": signature,
         "callerId": callerId,
         "targetBucketName": targetBucketName,
         "targetFileName": targetFileName,
         "timestamp": timestamp,
         "s3Event": s3Event,
     }
     optionalFields = (
         ('versionId', versionId),
         ('sourceFileName', sourceFileName),
         ('sourceBucketName', sourceBucketName),
     )
     for fieldName, fieldValue in optionalFields:
         if fieldValue:
             record[fieldName] = fieldValue

     try:
         lineageTable.put_item(Item=record)
     except ClientError as exc:
         print(exc)
         return {
             'Error': exc.response['Error']['Message'],
             'Status': exc.response['ResponseMetadata']['HTTPStatusCode'],
         }
     except Exception as exc:
         print(exc)
         return {
             'Error': 'Unknown error occurred during updating document',
             'Status': 400,
         }
     return {'Status': 200}
def processImage(documentId, features, bucketName, outputBucketName,
                 objectName, outputTableName, documentsTableName,
                 elasticsearchDomain):
    """Process one object end to end and mark its document complete.

    In order: calls ``callTextract``, runs ``OutputGenerator``, calls
    ``generatePdf``, runs ``ComprehendHelper.processComprehend`` on
    ``response.json`` under ``<objectName>-analysis/<documentId>/``, merges
    the generator's key/value pairs into the Comprehend entities, indexes the
    document text and finally marks the document complete.

    Args:
        documentId: Identifier used for output paths, indexing and the store.
        features: Container tested for "Text", "Forms" and "Tables".
        bucketName: Bucket of the source object.
        outputBucketName: Bucket that receives the generated output.
        objectName: Key of the source object.
        outputTableName: Name of the DynamoDB table given to the generator.
        documentsTableName: Name of the documents table used by the store.
        elasticsearchDomain: Passed through to ``OutputGenerator``.

    Returns:
        None.
    """
    wantsText = "Text" in features
    wantsForms = "Forms" in features
    wantsTables = "Tables" in features

    textractResponse = callTextract(bucketName, objectName, wantsText,
                                    wantsForms, wantsTables)

    outputTable = AwsHelper().getResource("dynamodb").Table(outputTableName)

    print("Generating output for DocumentId: {}".format(documentId))

    generator = OutputGenerator(documentId, textractResponse,
                                outputBucketName, objectName, wantsForms,
                                wantsTables, outputTable, elasticsearchDomain)
    generated = generator.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName)

    # Comprehend / ComprehendMedical entities are written below this prefix.
    analysisPath = objectName + "-analysis" + "/" + documentId + "/"
    print("path: " + analysisPath)
    comprehendClient = ComprehendHelper()
    entities = comprehendClient.processComprehend(
        outputBucketName, 'response.json', analysisPath, 100)

    print("DocumentId: {}".format(documentId))
    print("Processed Comprehend data: {}".format(entities))

    # Fold the key/value pairs found by the generator into the entity map.
    for name, value in generated[KVPAIRS].items():
        if name in entities:
            entities[name].add(value)
        else:
            entities[name] = value
    generator.indexDocument(generated[DOCTEXT], entities)

    store = datastore.DocumentStore(documentsTableName, outputTableName)
    store.markDocumentComplete(documentId)
Exemplo n.º 22
0
    def startDocumentTracking(self, documentId, bucketName, objectName, status, stage, timestamp, versionId=None):
        """Create the ops-table item that tracks a document, if none exists.

        The item starts with a one-entry ``timeline`` holding the given
        timestamp, stage and status. ``documentVersion`` is stored only when
        ``versionId`` is truthy.

        Returns:
            dict: ``{'Status': 200}`` on success, otherwise a dict with
            ``'Status'`` and ``'Error'`` entries. Exceptions raised by the
            write are reported this way rather than propagated.
        """
        opsTable = AwsHelper().getResource("dynamodb").Table(
            self._opsTableName)

        firstDatapoint = {
            "timestamp": timestamp,
            "stage": stage,
            "status": status,
        }
        trackingItem = {
            "documentId": documentId,
            "bucketName": bucketName,
            "objectName": objectName,
            "documentStatus": status,
            "documentStage": stage,
            "lastUpdate": timestamp,
            "timeline": [firstDatapoint],
        }
        if versionId:
            trackingItem['documentVersion'] = versionId

        try:
            opsTable.put_item(
                # Never overwrite an item that is already being tracked.
                ConditionExpression="attribute_not_exists(documentId)",
                Item=trackingItem,
            )
        except ClientError as exc:
            print(exc)
            return {
                'Status': exc.response['ResponseMetadata']['HTTPStatusCode'],
                'Error': exc.response['Error']['Message'],
            }
        except Exception as exc:
            print(exc)
            return {
                'Error': 'Unknown error occurred during updating document',
                'Status': 400,
            }
        return {'Status': 200}
Exemplo n.º 23
0
    def listItems(self):
        """List the data from database

        Scans the whole table, following ``LastEvaluatedKey`` so that every
        result page is collected, not just the first one.

        Args:
            None

        Returns:
            List of all items in the table. Empty when the configured
            database is not 'dynamodb' or when the scan fails (the error is
            printed).
        """
        if self._databaseName != 'dynamodb':
            return []

        dynamodb = AwsHelper().getResource(self._databaseName,
                                           self._awsRegion)
        table = dynamodb.Table(self._tableName)

        items = []
        scanArgs = {}
        try:
            while True:
                response = table.scan(**scanArgs)
                items.extend(response.get('Items', []))
                lastKey = response.get('LastEvaluatedKey')
                if not lastKey:
                    break
                # Resume the scan right after the last item of this page.
                scanArgs['ExclusiveStartKey'] = lastKey
        except ParamValidationError as e:
            print("Parameter validation error: %s" % e)
            return []
        except ClientError as e:
            print("Unexpected error: %s" % e)
            return []
        return items
Exemplo n.º 24
0
    def queryByPartitionKey(self, partitionKey):
        """List the data from database based on partition key

        Follows ``LastEvaluatedKey`` so that every result page is collected,
        not just the first one.

        Args:
           partitionKey(str): partition key value

        Returns:
            List of matching items. Empty when the configured database is not
            'dynamodb' or when the query fails (the error is printed).
        """
        if self._databaseName != 'dynamodb':
            return []

        dynamodb = AwsHelper().getResource(self._databaseName,
                                           self._awsRegion)
        table = dynamodb.Table(self._tableName)

        items = []
        try:
            queryArgs = {
                'KeyConditionExpression':
                    Key(self._partitionKeyName).eq(partitionKey),
            }
            while True:
                response = table.query(**queryArgs)
                items.extend(response.get('Items', []))
                lastKey = response.get('LastEvaluatedKey')
                if not lastKey:
                    break
                # Resume the query right after the last item of this page.
                queryArgs['ExclusiveStartKey'] = lastKey
        except ParamValidationError as e:
            print("Parameter validation error: %s" % e)
            return []
        except ClientError as e:
            print("Unexpected error: %s" % e)
            return []
        return items
Exemplo n.º 25
0
 def registerDocument(self, documentId, bucketName, documentName, documentLink, principalIAMWriter, timestamp, documentMetadata, documentVersion=None):
     """Add a document to the registry table unless its id is already taken.

     ``documentVersion`` is stored only when truthy.

     Returns:
         dict: ``{'Status': 200}`` on success, otherwise a dict with
         ``'Error'`` and ``'Status'`` entries. Exceptions raised by the write
         are reported this way rather than propagated.
     """
     registryTable = AwsHelper().getResource("dynamodb").Table(self._registryTableName)

     entry = dict(
         documentId=documentId,
         principalIAMWriter=principalIAMWriter,
         bucketName=bucketName,
         documentName=documentName,
         documentLink=documentLink,
         documentMetadata=documentMetadata,
         timestamp=timestamp,
     )
     if documentVersion:
         entry['documentVersion'] = documentVersion

     try:
         registryTable.put_item(
             # Never overwrite an already registered document.
             ConditionExpression="attribute_not_exists(documentId)",
             Item=entry,
         )
     except ClientError as exc:
         print(exc)
         return {
             'Error': exc.response['Error']['Message'],
             'Status': exc.response['ResponseMetadata']['HTTPStatusCode'],
         }
     except Exception as exc:
         print(exc)
         return {
             'Error': 'Unknown error occurred during updating document',
             'Status': 400,
         }
     return {'Status': 200}
    def updateItemStatus(self, itemId, itemStatus):
        """Overwrite ``itemStatus`` on an existing item.

        Args:
            itemId: Value of the ``itemId`` key of the item.
            itemStatus: New value for the ``itemStatus`` attribute.

        Returns:
            None on success, or ``{'Error': 'Item does not exist.'}`` when no
            item with that key exists.

        Raises:
            ClientError: Any failure other than the conditional check.
        """
        itemsTable = AwsHelper().getResource("dynamodb").Table(
            self._itemsTableName)

        try:
            itemsTable.update_item(
                Key={'itemId': itemId},
                UpdateExpression='SET itemStatus= :itemstatusValue',
                # Refuse to create an item implicitly.
                ConditionExpression='attribute_exists(itemId)',
                ExpressionAttributeValues={':itemstatusValue': itemStatus},
            )
        except ClientError as exc:
            if exc.response['Error']['Code'] != "ConditionalCheckFailedException":
                raise
            print(exc.response['Error']['Message'])
            return {'Error': 'Item does not exist.'}

        return None
Exemplo n.º 27
0
def processRequest(request):
    """Post-process a finished job and mark its document complete.

    Steps, in order: fetch the result pages with ``getJobResults``, run
    ``OutputGenerator``, call ``generatePdf``, run
    ``ComprehendHelper.processComprehend``, optionally hand the document to
    ``KendraHelper.indexDocument`` (only when ``KENDRA_INDEX_ID`` is set in
    the environment), merge the generator's key/value pairs into the
    Comprehend entity map, call ``opg.indexDocument`` and finally
    ``DocumentStore.markDocumentComplete``.

    Args:
        request: Mapping with the keys ``jobId``, ``jobTag`` (used as the
            document id), ``jobStatus``, ``jobAPI``, ``bucketName``,
            ``outputBucketName``, ``objectName``, ``outputTable``,
            ``documentsTable`` and ``elasticsearchDomain``.

    Returns:
        ``{'statusCode': 200, 'body': <summary string>}``.

    Raises:
        KeyError: if ``request`` lacks one of the keys above, or if
            ``KENDRA_INDEX_ID`` is set but ``KENDRA_ROLE_ARN`` is not.
    """
    print("Request : {}".format(request))

    jobId = request['jobId']
    documentId = request['jobTag']
    # Not used below; the lookup still raises KeyError when the key is absent.
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    outputBucketName = request['outputBucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]
    elasticsearchDomain = request["elasticsearchDomain"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages recieved: {}".format(len(pages)))

    # Forms and tables are both requested only for the analysis API.
    detectForms = detectTables = (jobAPI == "StartDocumentAnalysis")

    # Build the output table handle once.
    ddb = AwsHelper().getResource("dynamodb").Table(outputTable)

    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId,
                                  SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(
        documentId, outputPath))

    opg = OutputGenerator(documentId, pages, outputBucketName, objectName,
                          detectForms, detectTables, ddb, outputPath,
                          elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(documentId, bucketName, objectName, outputBucketName,
                outputPath)

    # generate Comprehend and ComprehendMedical entities
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    responseDocumentName = "{}{}response.json".format(outputPath,
                                                      TEXTRACT_PATH_S3_PREFIX)
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, maxPages)

    # if Kendra is available then let it index the document
    if 'KENDRA_INDEX_ID' in os.environ:
        kendraClient = KendraHelper()
        baseName = os.path.basename(objectName)
        # Stem is the text before the FIRST dot.
        # NOTE(review): presumably this has to match the name generatePdf
        # gives the searchable PDF - confirm before changing it.
        fileName = baseName.split(".")[0]
        # Extension is the text after the LAST dot, or empty when the name
        # has no dot at all (indexing [1] of a split raised IndexError for
        # dot-less names and picked the wrong part for multi-dot names).
        _, dot, suffix = baseName.rpartition(".")
        fileExtension = suffix if dot else ""
        outputDocumentName = "{}{}-searchable.pdf".format(outputPath, fileName)
        # NOTE(review): bucketName (not outputBucketName) is passed here -
        # verify that this is the bucket holding the searchable PDF.
        kendraClient.indexDocument(os.environ['KENDRA_INDEX_ID'],
                                   os.environ['KENDRA_ROLE_ARN'], bucketName,
                                   outputDocumentName, documentId,
                                   fileExtension)

    print("DocumentId: {}".format(documentId))
    print("Processed Comprehend data: {}".format(comprehendAndMedicalEntities))

    # index document once the comprehend entities and KVPairs have been extracted
    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            # assumes the existing value supports .add (presumably a set) -
            # TODO confirm against processComprehend
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(documentId)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        documentId, bucketName, objectName)

    return {'statusCode': 200, 'body': output}
Exemplo n.º 28
0
def processRequest(request):
    """Post-process a finished job and mark its document complete.

    Steps, in order: fetch the result pages with ``getJobResults``, run
    ``OutputGenerator``, call ``generatePdf``, run
    ``ComprehendHelper.processComprehend``, merge the generator's key/value
    pairs into the Comprehend entity map, call ``opg.indexDocument`` and
    finally ``DocumentStore.markDocumentComplete``.

    Args:
        request: Mapping with the keys ``jobId``, ``jobTag``, ``jobStatus``,
            ``jobAPI``, ``bucketName``, ``outputBucketName``,
            ``objectName``, ``outputTable``, ``documentsTable`` and
            ``elasticsearchDomain``.

    Returns:
        ``{'statusCode': 200, 'body': <summary string>}``.

    Raises:
        KeyError: if ``request`` lacks one of the keys above.
    """
    print(request)

    jobId = request['jobId']
    jobTag = request['jobTag']
    # Not used below; the lookup still raises KeyError when the key is absent.
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    outputBucketName = request['outputBucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]
    elasticsearchDomain = request["elasticsearchDomain"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages recieved: {}".format(len(pages)))

    # Forms and tables are both requested only for the analysis API.
    detectForms = detectTables = (jobAPI == "StartDocumentAnalysis")

    # Build the output table handle once.
    ddb = AwsHelper().getResource("dynamodb").Table(outputTable)

    opg = OutputGenerator(jobTag, pages, outputBucketName, objectName,
                          detectForms, detectTables, ddb, elasticsearchDomain)
    opg_output = opg.run()

    generatePdf(jobTag, bucketName, objectName, outputBucketName)

    # generate Comprehend and ComprehendMedical entities
    path = objectName + "-analysis" + "/" + jobTag + "/"
    print("path: " + path)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, 'response.json', path, maxPages)

    print("DocumentId: {}".format(jobTag))

    # index document once the comprehend entities and KVPairs have been extracted
    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            # assumes the existing value supports .add (presumably a set) -
            # TODO confirm against processComprehend
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)

    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(jobTag)

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName)

    return {'statusCode': 200, 'body': output}