Exemplo n.º 1
0
def processImage(documentId, bucketName, objectName, callerId):
    """Send one stored object through Textract, persist outputs, log lineage.

    Args:
        documentId: Identifier used in the log line, the output tag and the
            lineage record.
        bucketName: Bucket handed to ``callTextract``; recorded as the
            lineage source bucket.
        objectName: Object handed to ``callTextract``; reused as both the
            source and target file name in the lineage record.
        callerId: Recorded verbatim in the lineage record.
    """
    textractResult = callTextract(bucketName, objectName)

    print("Generating output for documentId: {}".format(documentId))

    # Form and table generation are switched off on this path.
    generator = OutputGenerator(
        documentId=documentId,
        response=textractResult,
        bucketName=textractBucketName,
        objectName=objectName,
        forms=False,
        tables=False,
    )
    generator.writeTextractOutputs(
        taggingStr="documentId={}".format(documentId))

    lineageEntry = {
        "documentId": documentId,
        "callerId": callerId,
        "sourceBucketName": bucketName,
        "targetBucketName": textractBucketName,
        "sourceFileName": objectName,
        "targetFileName": objectName,
    }
    lineage_client.recordLineage(lineageEntry)
Exemplo n.º 2
0
def processRequest(request):
    """Turn a finished Textract job into stored outputs and a lineage record.

    The pipeline stage is marked in progress while the job results are
    fetched, converted and written, and is marked failed before any error
    is propagated so the stage is never left in progress after a failure.

    Args:
        request: Mapping providing 'jobStatus', 'jobId', 'jobTag', 'jobAPI',
            'bucketName', 'objectName' and 'callerId'. 'jobTag' is used as
            the document id throughout.

    Returns:
        dict: ``{'statusCode': 200, 'body': <summary string>}`` on success.

    Raises:
        KeyError: If one of the job/bucket/object keys is missing.
        Exception: If 'jobStatus' is 'FAILED', or re-raised unchanged from
            fetching the results, building the OutputGenerator, writing the
            outputs or recording lineage.
    """
    status = request['jobStatus']
    jobId = request['jobId']
    jobTag = request['jobTag']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    objectName = request['objectName']

    pipeline_client.body = {
        "documentId": jobTag,
        "bucketName": bucketName,
        "objectName": objectName,
        "stage": PIPELINE_STAGE
    }
    if status == 'FAILED':
        pipeline_client.stageFailed(
            "Textract Analysis didn't complete successfully")
        raise Exception(
            "Textract job for document ID {}; bucketName {} fileName {}; failed during Textract analysis. Please double check the document quality"
            .format(jobTag, bucketName, objectName))

    pipeline_client.stageInProgress()
    try:
        pages = getJobResults(jobAPI, jobId)
    except Exception:
        pipeline_client.stageFailed()
        # Bare raise re-raises the active exception as-is.
        raise

    print("Result pages received: {}".format(len(pages)))

    # Forms and tables are requested from OutputGenerator only for
    # StartDocumentAnalysis jobs.
    isAnalysisJob = jobAPI == "StartDocumentAnalysis"

    try:
        opg = OutputGenerator(documentId=jobTag,
                              response=pages,
                              bucketName=textractBucketName,
                              objectName=objectName,
                              forms=isAnalysisJob,
                              tables=isAnalysisJob)
    except Exception:
        pipeline_client.stageFailed(
            "Could not convert results from Textract into processable object. Try uploading again."
        )
        raise

    # Writing outputs and recording lineage can fail too; mark the stage
    # failed in that case, matching the other failure paths above.
    try:
        opg.writeTextractOutputs(taggingStr="documentId={}".format(jobTag))

        lineage_client.recordLineage({
            "documentId": jobTag,
            "callerId": request["callerId"],
            "sourceBucketName": bucketName,
            "targetBucketName": textractBucketName,
            "sourceFileName": objectName,
            "targetFileName": objectName
        })
    except Exception:
        pipeline_client.stageFailed(
            "Could not store Textract outputs or record lineage for the document."
        )
        raise

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName)
    pipeline_client.stageSucceeded()
    print(output)
    return {'statusCode': 200, 'body': output}