Exemplo n.º 1
0
def lambda_handler(event, context):
    """Produce bounding-box output for the PDF described by ``event``.

    The handler copies the source PDF into a scratch folder, runs the local
    extraction path when it is enabled and the PDF holds enough characters,
    and then rewrites the event so that it points at the generated text
    object.

    Configuration is read from the environment: ``DOCUMENTS_BUCKET``,
    ``TEXTRACT_ONLY``, ``MIN_CHAR_NEEDED`` and ``EXTRACT_PDF_LINES``. The two
    flags are compared against the literal strings ``"false"`` / ``"true"``.

    Args:
        event: Mapping that must provide ``status``, ``objectName`` and
            ``s3Url``; all of its entries are carried into the working
            environment.
        context: Lambda context object; unused.

    Returns:
        ``event`` plus an ``errorMessage`` entry when ``event['status']`` is
        not positive; otherwise the result of ``update_event``. When the
        word-level extraction reports a failure, the environment handed to
        ``update_event`` carries ``status`` -1 and an ``errorMessage``.

    Raises:
        TypeError: If ``MIN_CHAR_NEEDED`` is not set (``int(None)``).
        ValueError: If ``MIN_CHAR_NEEDED`` is not an integer literal.
    """
    if (event['status'] <= 0):
        return {**event, "errorMessage": "Status isnt positive"}

    documents_bucket = os.environ.get('DOCUMENTS_BUCKET')
    aws_env = {
        **event,
        "bucketName": documents_bucket,
        "awsRegion": 'eu-west-1',
        "tmpJsonOutput": "/tmp/tmp_result.json",
        "tmpTxtOutput": "/tmp/tmp_result.txt",
        "outputBucket": documents_bucket,
        "outputNameJson": get_bbox_filename(event['objectName'], ".json"),
        "outputNameTxt": get_bbox_filename(event['objectName'], ".txt"),
        "textractOnly": os.environ.get('TEXTRACT_ONLY'),
        "minCharNeeded": int(os.environ.get('MIN_CHAR_NEEDED')),
        "extract_pdf_lines": os.environ.get('EXTRACT_PDF_LINES'),
    }
    extract_pdf_lines = aws_env['extract_pdf_lines']
    textract_only = aws_env['textractOnly']
    tmp_folder = "/tmp/pdfToBbox"
    pdf_tmp_path = copy_pdf_to_tmp(tmp_folder, aws_env)

    print("==> aws_env: ", aws_env)
    if textract_only == "false" and is_pdf_has_enough_characters(
            pdf_tmp_path, aws_env['minCharNeeded']) is True:
        print("=> Extracting bounding box with pdfplumber")
        if extract_pdf_lines == "true":
            print("=> Extracting pdf lines bbox")
            pdf = Pdf(pdf_tmp_path, aws_env['tmpJsonOutput'],
                      aws_env['tmpTxtOutput'])
            pdf.parse_pdf()
            pdf.save_in_json()
            pdf.save_in_txt()
            write_bbox_to_s3(aws_env)
        elif execute_pdf_to_bbox(pdf_tmp_path, aws_env['tmpJsonOutput']):
            # A truthy result signals a failure. Stop here so the error
            # status is not overwritten by the success bookkeeping below and
            # no size lookup is attempted for an output that was never
            # written.
            print("=> Error while trying to get pdf information")
            aws_env["status"] = -1
            aws_env["errorMessage"] = "PDF format not supported."
            AwsHelper.refreshTmpFolder(tmp_folder)
            return update_event(aws_env, event)
        else:
            print("=> Extracting pdf words bbox")
            write_bbox_to_s3(aws_env)
    else:
        print("Extracting bounding box with textract")
        # NOTE(review): the send_to_textract(aws_env) call is disabled, so
        # this branch writes nothing itself; the size lookup below presumably
        # relies on an object that already exists - confirm.

    aws_env['size'] = S3Helper.getS3FileSize(aws_env['bucketName'],
                                             aws_env['outputNameTxt'],
                                             aws_env['awsRegion'])
    # Point the outgoing event at the generated text object.
    aws_env["s3Url"] = get_new_s3_url(aws_env['s3Url'], "txt")
    aws_env["status"] = 1
    aws_env["errorMessage"] = None
    aws_env["contentType"] = "text/txt"
    aws_env['objectName'] = aws_env['outputNameTxt']
    aws_env["sourceUrl"] = aws_env["s3Url"]
    AwsHelper.refreshTmpFolder(tmp_folder)
    return update_event(aws_env, event)
Exemplo n.º 2
0
def copy_pdf_to_tmp(tmp_folder: str, aws_env: dict) -> str:
    """Download the PDF named in ``aws_env`` into ``tmp_folder``.

    Args:
        tmp_folder: Scratch directory. It is passed to
            ``AwsHelper.refreshTmpFolder`` when it already exists and is
            created when missing.
        aws_env: Must provide ``bucketName``, ``objectName`` and
            ``awsRegion``, which locate the object to read.

    Returns:
        Path of the local copy, named ``tmp_<n>.pdf`` with the lowest ``n``
        that is not already taken by a file in ``tmp_folder``.
    """
    pdf_content = S3Helper.readBytesFromS3(aws_env['bucketName'],
                                           aws_env['objectName'],
                                           aws_env['awsRegion'])
    if os.path.isdir(tmp_folder):
        AwsHelper.refreshTmpFolder(tmp_folder)
    # exist_ok guards against a refresh helper that empties the directory
    # instead of removing it; a bare makedirs would raise FileExistsError.
    os.makedirs(tmp_folder, exist_ok=True)

    # Probe tmp_0.pdf, tmp_1.pdf, ... until an unused name is found so an
    # existing file is never overwritten.
    index = 0
    pdf_tmp = os.path.join(tmp_folder, "tmp_{0}.pdf".format(index))
    while os.path.isfile(pdf_tmp):
        index += 1
        pdf_tmp = os.path.join(tmp_folder, "tmp_{0}.pdf".format(index))

    with open(pdf_tmp, "wb") as tmp_file:
        tmp_file.write(pdf_content)
    print("Copy {0} to {1}".format(aws_env["objectName"], pdf_tmp))
    return pdf_tmp
Exemplo n.º 3
0
def lambda_handler(event, context):
    """Run the zip-extraction pipeline for the object named in ``event``.

    The working environment is ``event`` extended with the documents bucket
    (from ``DOCUMENTS_BUCKET``), a fixed region and the output name derived
    by ``get_zip_output``. The archive is copied under
    ``/tmp/zip_extraction``, expanded by ``extract_nested_zip`` and handed to
    ``write_extracted_zip``; the scratch folder is refreshed afterwards.

    Args:
        event: Mapping that must provide ``objectName`` and be JSON
            serialisable (it is logged with ``json.dumps``).
        context: Lambda context object; unused.

    Returns:
        The result of ``update_event`` for the environment, with ``status``
        set to 0 and ``errorMessage`` set to ``None``.

    Raises:
        KeyError: If ``DOCUMENTS_BUCKET`` is not set or ``event`` has no
            ``objectName``.
    """
    print("=> Event: {0}".format(json.dumps(event)))

    # Build the environment step by step; insertion order matches the log
    # output of a single dict literal.
    aws_env = {**event}
    documents_bucket = os.environ['DOCUMENTS_BUCKET']
    aws_env["bucketName"] = documents_bucket
    aws_env["outputBucket"] = documents_bucket
    aws_env["awsRegion"] = "eu-west-1"
    aws_env["outputName"] = get_zip_output(event['objectName'])
    print("=> AWS env: {0}".format(json.dumps(aws_env)))

    scratch_dir = "/tmp/zip_extraction"
    if os.path.isdir(scratch_dir):
        AwsHelper.refreshTmpFolder(scratch_dir)

    unpack_dir = os.path.join(scratch_dir, "extractions")
    prepare_output_zip(unpack_dir)
    archive_path = copy_zip_to_tmp(scratch_dir, aws_env)

    debug_line = "[DEBUG]: Extracting {0} into tmp file: {1}".format(
        archive_path, unpack_dir)
    print(debug_line)

    extract_nested_zip(archive_path, unpack_dir)
    write_extracted_zip(aws_env, unpack_dir)

    aws_env.update(status=0, errorMessage=None)
    AwsHelper.refreshTmpFolder(scratch_dir)
    return update_event(aws_env, event)