示例#1
0
def test_words(json_response):
    """Check the page, line and word counts parsed from the sample response."""
    document = Document(json_response)
    assert len(document.pages) == 1

    page_lines = list(document.pages[0].lines)
    assert len(page_lines) == 22

    # Flatten the words of every line into a single list before counting.
    all_words = []
    for page_line in page_lines:
        for word in page_line.words:
            all_words.append(word)
    assert len(all_words) == 53
示例#2
0
def AnalyzeDocument(byteImages):
    """
    Run Textract form analysis on each image and wrap every response.

    params:
        * byteImages: iterable of image contents; each item is sent to
          Textract as the ``Bytes`` of the document

    return:
        * None: a response came back with a non-200 HTTP status code
        * list of Document: one parsed result per image, in input order
    """
    logging.info('Analizando arquivo(s)')

    documents = []
    for image_bytes in byteImages:
        result = TEXTRACT.analyze_document(
            Document={'Bytes': image_bytes},
            FeatureTypes=["FORMS"],
        )

        status_code = result['ResponseMetadata']['HTTPStatusCode']
        if status_code != 200:
            # The whole batch is abandoned on the first failed response.
            logging.info('Erro ao Analizar arquivo')
            return None

        documents.append(Document(result))

    return documents
示例#3
0
def lambda_handler(event, context):
    """Analyze the S3 object named in the first event record and print results.

    Reads the bucket name and object key from ``event['Records'][0]['s3']``,
    runs a synchronous Textract ``analyze_document`` call for tables and
    forms, then prints every table cell followed by every form field.

    params:
        * event: S3 notification event; only the first record is used
        * context: Lambda context object (unused)

    return:
        * None; all results are printed
    """
    from urllib.parse import unquote_plus

    print("- - - Amazon Textract Demo - - -")
    s3_info = event['Records'][0]['s3']
    # read the bucket name from the event
    name_of_the_bucket = s3_info['bucket']['name']
    # read the object key from the event; S3 notifications URL-encode the key
    # (e.g. spaces arrive as '+'), so decode it before using it as an S3 name
    name_of_the_doc = unquote_plus(s3_info['object']['key'])
    print(name_of_the_bucket)
    print(name_of_the_doc)
    # Synchronous analysis of the document for tables and key-value pairs.
    # API ref : https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/textract.html#Textract.Client.analyze_document
    response = textract_client.analyze_document(
        Document={
            'S3Object': {
                'Bucket': name_of_the_bucket,
                'Name': name_of_the_doc
            }
        },
        FeatureTypes=["TABLES", "FORMS"])
    print(str(response))
    doc = Document(response)
    for page in doc.pages:
        # Print tables
        for table in page.tables:
            for r, row in enumerate(table.rows):
                for c, cell in enumerate(row.cells):
                    print("Table[{}][{}] = {}".format(r, c, cell.text))
    # Fields are printed in a second pass so all tables come first.
    for page in doc.pages:
        # Print fields
        print("Fields:")
        for field in page.form.fields:
            print("Key: {}, Value: {}".format(field.key, field.value))
def run():
    """Load the saved response from ``temp-response.json`` and process it."""
    raw_json = FileHelper.readFile("temp-response.json")
    parsed_document = Document(json.loads(raw_json))
    processDocument(parsed_document)
示例#5
0
文件: og.py 项目: keshava/jarvis-be
 def __init__(self, response, forms, tables, **kwargs):
     """Store the response, the output flags and the optional S3 details.

     The keyword arguments ``documentId``, ``bucketName`` and
     ``objectName`` are optional and default to ``None``.
     """
     self.response, self.forms, self.tables = response, forms, tables

     # Optional identifiers: a missing key yields None.
     self.documentId = kwargs.get("documentId")
     self.bucketName = kwargs.get("bucketName")
     self.objectName = kwargs.get("objectName")

     # The output prefix is derived from the object name.
     self.outputPath = "{}/ocr-analysis".format(self.objectName)
     self.document = Document(self.response)
示例#6
0
def run():
    """Read ``test-response.json`` and pass the parsed document to processDocument."""
    with open("test-response.json", 'r') as response_file:
        payload = json.load(response_file)

    processDocument(Document(payload))
    def __init__(self, response, bucketName, objectName, tables, metadata):
        """Store the response and its S3 origin, and parse it into a Document.

        params:
            * response: value handed to ``Document`` for parsing
            * bucketName: stored as ``self.bucketName``
            * objectName: stored as ``self.objectName``; also used to build
              ``self.outputPath`` as ``"<objectName>-analysis/"``
            * tables: stored as ``self.tables``
            * metadata: stored as ``self.metadata``
        """
        self.response = response
        self.bucketName = bucketName
        self.objectName = objectName
        self.tables = tables
        self.metadata = metadata

        # The template has a single placeholder, so only one argument is passed.
        self.outputPath = "{}-analysis/".format(objectName)

        self.document = Document(self.response)
示例#8
0
def analyze():
    """Analyze the configured S3 object for forms and tables, then call lines()."""
    # Call Amazon Textract on the object named by the module-level settings.
    s3_object = {'Bucket': s3BucketName, 'Name': documentName}
    textract_response = textract.analyze_document(
        Document={'S3Object': s3_object},
        FeatureTypes=["FORMS", "TABLES"],
    )

    lines(Document(textract_response))
示例#9
0
 def __init__(self, documentId, response, bucketName, objectName, forms,
              tables, ddb):
     """Store the constructor arguments and parse the response into a Document.

     ``self.outputPath`` is built as ``"<objectName>-analysis/<documentId>/"``.
     """
     # Identity of the document and where it came from.
     self.documentId, self.response = documentId, response
     self.bucketName, self.objectName = bucketName, objectName

     # Output switches and the ``ddb`` handle passed in by the caller.
     self.forms, self.tables = forms, tables
     self.ddb = ddb

     self.outputPath = "{}-analysis/{}/".format(objectName, documentId)
     self.document = Document(self.response)
示例#10
0
def analyze():
    """Analyze the configured S3 object for forms and tables, then call tables()."""
    s3_object = {'Bucket': s3BucketName, 'Name': documentName}
    textract_response = textract.analyze_document(
        Document={'S3Object': s3_object},
        FeatureTypes=["FORMS", "TABLES"],
    )

    parsed = Document(textract_response)
    # Only the tables output is enabled; lines(parsed) and forms(parsed)
    # are the other available outputs.
    tables(parsed)
示例#11
0
def _box_contains(outer, inner):
    """Return True when bounding box *inner* lies entirely within *outer*."""
    return (inner.left >= outer.left
            and inner.top >= outer.top
            and inner.left + inner.width <= outer.left + outer.width
            and inner.top + inner.height <= outer.top + outer.height)


def get_Lines(pgno, data):
    """Collect the text lines of a page that are not inside a detected table.

    A table counts only when every one of its rows has more than two cells.
    Each such table is appended to ``tablesList`` (a name defined outside
    this function) as ``(pgno, rows)``, and its bounding box is used to
    filter out the lines it contains.

    params:
        * pgno: page number recorded with every table and every line
        * data: pair ``(a, x)``; ``a`` is handed to ``Document`` and
          ``x.size`` supplies the ``(width, height)`` used to scale the
          relative bounding boxes (presumably a PIL image -- confirm
          with the caller)

    return:
        * pandas DataFrame with one row per kept line and the columns
          Text, Page, Left, Top, Height and Width
    """
    a, x = data
    doc = Document(a)
    w, h = x.size
    lines = []
    for page in doc.pages:
        table_geometries = []
        for table in page.tables:
            # Grids with any row of two cells or fewer are not treated as tables.
            if all(len(row.cells) > 2 for row in table.rows):
                t = [[[cell.text] for cell in row.cells]
                     for row in table.rows]
                tablesList.append((pgno, t))
                table_geometries.append(table.geometry)

        for line in page._lines:
            line_box = line.geometry.boundingBox
            # Keep the line unless some table fully contains it. The earlier
            # logic only kept a line when it was inside *every* table, which
            # dropped nearly all lines on pages with two or more tables.
            inside_a_table = any(
                _box_contains(geometry.boundingBox, line_box)
                for geometry in table_geometries)
            if not inside_a_table:
                lines.append(line)

    # Scale the relative box coordinates by the image size.
    lines = [{
        "Text": line.text,
        "Page": pgno,
        "Left": line.geometry.boundingBox.left * w,
        "Top": line.geometry.boundingBox.top * h,
        "Height": line.geometry.boundingBox.height * h,
        "Width": line.geometry.boundingBox.width * w
    } for line in lines]
    return pd.DataFrame(lines)
示例#12
0
def invokeTextract(bucketName, documentKey):
    """Run Textract forms analysis on an S3 object and return the parsed Document."""
    print('Loading invokeTextract')

    # Call Amazon Textract on the object stored in S3.
    s3_location = {'S3Object': {'Bucket': bucketName, 'Name': documentKey}}
    textract_response = textract.analyze_document(
        Document=s3_location,
        FeatureTypes=["FORMS"],
    )

    return Document(textract_response)
示例#13
0
def get_tables_from_pdf(s3BucketName, documentName):
    """Start an analysis job for an S3 document and return its tables as CSV.

    return:
        * list: one ``generate_csv_from_table`` result per table, when the
          job status is "SUCCEEDED"
        * dict ``{"message": "FAILED"}``: when the job status is "FAILED"
        * None: for any other status
    """
    jobId = startJob(s3BucketName, documentName)
    print("Started job with id: {}".format(jobId))
    status = isJobComplete(jobId)

    if status == "FAILED":
        return {"message": status}
    if status != "SUCCEEDED":
        return None

    parsed = Document(getJobResults(jobId))
    return [
        generate_csv_from_table(table)
        for page in parsed.pages
        for table in page.tables
    ]
示例#14
0
def get_text_analysis(bucket_name, key):
    """Extract the form key/value pairs of an S3 document with Textract.

    Every detected pair is printed and also returned.

    return:
        * list of ``(str(key), str(value))`` tuples, in page and field order
    """
    s3_object = {'Bucket': bucket_name, 'Name': key}
    analysis = client_text.analyze_document(
        Document={'S3Object': s3_object},
        FeatureTypes=['FORMS'],
    )
    parsed = Document(analysis)

    pairs = []
    for page in parsed.pages:
        print("Key Value Pairs:")
        for field in page.form.fields:
            detected_key, detected_value = field.key, field.value
            print("Detected Key: {}, Detected Value: {}".format(
                detected_key, detected_value))
            pairs.append((str(detected_key), str(detected_value)))

    return pairs
示例#15
0
def s3_handler(record):
    """Read a 9x9 grid of numbers from the tables of an S3 image.

    Runs Textract table analysis on the object named in ``record``, strips
    the selection markers and spaces from every cell, and converts each
    cell to ``int`` where possible (an empty cell becomes 0).

    params:
        * record: a single S3 event record; the bucket name and object key
          are read from ``record['s3']``

    return:
        * ``(grid_id, input_matrix)``: the object key without the
          ``incoming/`` prefix and extension, and a 9x9 ``np.matrix``

    raises:
        * Exception: when the cells do not form exactly 81 integers; the
          failure is first recorded in the ``sudokuGridRecords`` table
    """
    # NOTE(review): the key is used exactly as it appears in the event;
    # confirm whether callers pass it URL-decoded.
    object_key = record['s3']['object']['key']

    # process using S3 object
    response = txt_client.analyze_document(
        Document={
            'S3Object': {
                'Bucket': record['s3']['bucket']['name'],
                'Name': object_key,
            }
        },
        FeatureTypes=["TABLES"])
    grid_id = os.path.splitext(object_key.replace('incoming/', ''))[0]

    # Get the text blocks
    doc = Document(response)
    input_matrix = []
    for page in doc.pages:
        for table in page.tables:
            for row in table.rows:
                for cell in row.cells:
                    number = cell.text.replace('NOT_SELECTED,', '').replace(
                        'SELECTED,', '').replace(' ', '')
                    if number == '':
                        number = 0
                    try:
                        input_matrix.append(int(number))
                    except ValueError:
                        # Keep the raw text so the failure record shows what
                        # was actually read from the cell.
                        input_matrix.append(number)

    if len(input_matrix) == 81 and all(
            isinstance(i, int) for i in input_matrix):
        input_matrix = np.matrix(input_matrix).reshape(9, 9)
    else:
        dynamodb_table = boto3.resource('dynamodb').Table('sudokuGridRecords')
        print('Grid not recognized')
        dynamodb_table.put_item(
            Item={
                'grid_id': grid_id,
                'input': json.dumps(np.array(input_matrix).ravel().tolist()),
                'solution': 'Grid could not be read',
            })
        raise Exception(f'Sudoku not detected in picture {grid_id}')
    return grid_id, input_matrix
示例#16
0
def handler(event, context):
    """Post-process a stored Textract result into vendor, date and total fields.

    The Textract JSON is loaded from the S3 object named by
    ``event["Bucket"]`` / ``event["Key"]``. The vendor is taken to be the
    first non-empty LINE block, totals and dates are searched for in the
    form key-value pairs, and Amazon Comprehend is used as a fallback for
    dates when no date field is found.

    params:
        * event: dict with the "Bucket" and "Key" of the Textract result
        * context: Lambda context object (unused)

    return:
        * dict with "Date", "Total" and "Vendor" entries, each holding
          "Confidence" and "Value" (plus "Alternatives" when there are
          lower-ranked candidates), and an overall "Confidence" that is
          the minimum of the three

    raises:
        * MalformedRequest: when "Bucket" or "Key" is missing from the event
    """
    try:
        srcbucket = event["Bucket"]
        srckey = event["Key"]
    except KeyError as ke:
        raise MalformedRequest(
            f"Missing field {ke}, please check your input payload")

    # Load and parse Textract result from S3
    textract_result = json.load(s3.Object(srcbucket, srckey).get()["Body"])
    doc = Document(textract_result)

    # Define post processing variables
    amount_form_keys = ["total", "amount"]
    date_form_keys = ["date"]

    # Since we're just taking the first line as the vendor name, there'll be exactly one candidate:
    vendor_name_result = {"Confidence": 0, "Value": ""}
    vendor_name_candidates = [vendor_name_result]

    # For the other fields, we'll search for multiple options:
    date_candidates = []
    total_amount_candidates = []

    # Receipts don't usually list out a key-value pair like "Vendor: XYZ", the business name is just the
    # first thing on the receipt! So we'll make that our assumption to extract vendor:
    remaining_lines = []
    for item in textract_result["Blocks"]:
        if item["BlockType"] != "LINE":
            continue
        if vendor_name_result["Value"] == "":
            vendor_name_result["Value"] = item["Text"]
            # Using the raw OCR confidence is optimistic, because "first
            # line = vendor name" is itself an assumption. It is good
            # enough for this sample.
            vendor_name_result["Confidence"] = item["Confidence"]
        else:
            # Collect the rest of the receipt text to search with
            # Comprehend later.
            remaining_lines.append(item["Text"] + " ")
    text = "".join(remaining_lines)

    # For amount and date fields, we'll try searching the key-value pairs first:
    for page in doc.pages:
        # A key such as "Total amount" matches both search keys; remember
        # which fields were already handled so each one yields at most one
        # candidate.
        seen_amount_fields = set()
        for key in amount_form_keys:
            for field in page.form.searchFieldsByKey(key):
                if id(field) in seen_amount_fields:
                    continue
                seen_amount_fields.add(id(field))
                if field.value is None:
                    continue
                key_text = field.key.text.lower()
                if not any(k in key_text for k in amount_form_keys):
                    continue
                try:
                    # If it's the total, the value should be parseable as a number!
                    # TODO: Allow for other leading currency symbols and 3-letter-acronyms
                    float(field.value.text.lstrip("$"))
                except ValueError:
                    print("Cannot proceed String to Number {}".format(
                        field.value.text))
                    continue
                total_amount_candidates.append({
                    # The output confidence is driven by the Textract
                    # scores: take the weaker of the key and the value.
                    "Confidence":
                    min(field.key.confidence, field.value.confidence),
                    "Value":
                    field.value.text,
                })

        for key in date_form_keys:
            for field in page.form.searchFieldsByKey(key):
                if field.value is None:
                    continue
                key_text = field.key.text.lower()
                if any(k in key_text for k in date_form_keys):
                    date_candidates.append({
                        "Confidence":
                        min(field.key.confidence, field.value.confidence),
                        "Value":
                        field.value.text
                    })

    # If we couldn't find any date-looking fields in the key-value pairs (likely for verbose invoice-style
    # documents, but not for short receipts), then we'll use Amazon Comprehend to just detect date entities.
    # Comprehend rejects empty text, so skip the call when there is nothing beyond the vendor line.
    if not date_candidates and text.strip():
        comprehend_entities = comprehend.detect_entities(
            Text=text, LanguageCode="en")["Entities"]
        for entity in comprehend_entities:
            if entity.get("Type") == "DATE":
                value_str = entity.get("Text").strip("\t\n\r")
                # A little bit of validation that it looks date-like:
                if "/" in value_str or ":" in value_str or "-" in value_str:
                    date_candidates.append({
                        # Comprehend scores confidence 0-1 while Textract
                        # does 0-100; standardize on the Textract scale.
                        "Confidence":
                        entity.get("Score", 0) * 100,
                        "Value":
                        value_str
                    })

    # Sort our candidates by descending confidence and take the highest confidence candidate for each field:
    date_candidates = sorted(date_candidates,
                             key=lambda c: c["Confidence"],
                             reverse=True)
    total_amount_candidates = sorted(total_amount_candidates,
                                     key=lambda c: c["Confidence"],
                                     reverse=True)
    date_result = date_candidates[0] if date_candidates else None
    total_amount_result = (total_amount_candidates[0]
                           if total_amount_candidates else None)

    result = {
        "Date": {
            "Confidence": date_result["Confidence"] if date_result else 0,
            "Value": date_result["Value"] if date_result else "",
        },
        "Total": {
            "Confidence":
            total_amount_result["Confidence"] if total_amount_result else 0,
            "Value":
            total_amount_result["Value"] if total_amount_result else "",
        },
        # The vendor result always exists (it starts as an empty candidate).
        "Vendor": {
            "Confidence": vendor_name_result["Confidence"],
            "Value": vendor_name_result["Value"],
        },
    }

    # How do we measure composite result "Confidence" for many fields driven by different logics? We'll just
    # take the minimum, since a human review should be triggered by the weakest field.
    result["Confidence"] = min(
        field_result["Confidence"] for field_result in result.values())
    if len(date_candidates) > 1:
        result["Date"]["Alternatives"] = date_candidates[1:]
    if len(total_amount_candidates) > 1:
        result["Total"]["Alternatives"] = total_amount_candidates[1:]
    if len(vendor_name_candidates) > 1:
        result["Vendor"]["Alternatives"] = vendor_name_candidates[1:]

    return result
def process_text_analysis(bucket, document):
    """Analyze an S3 image with Textract, annotate it and display it.

    The image is downloaded from S3, analyzed for tables and forms, and
    then drawn on: the value of any first-page form field whose key
    contains "social security" is blacked out, and every KEY_VALUE_SET
    block gets a bounding box (red for keys, blue otherwise).

    params:
        * bucket: name of the S3 bucket holding the image
        * document: key of the image object in that bucket

    return:
        * number of blocks in the Textract response
    """
    # Get the document from S3
    s3_connection = boto3.resource('s3')
    s3_object = s3_connection.Object(bucket, document)
    s3_response = s3_object.get()

    stream = io.BytesIO(s3_response['Body'].read())
    image = Image.open(stream)

    # Analyze the document
    client = boto3.client('textract')
    image_binary = stream.getvalue()
    response = client.analyze_document(Document={'Bytes': image_binary},
                                       FeatureTypes=["TABLES", "FORMS"])
    doc = Document(response)

    # Get the text blocks
    blocks = response['Blocks']
    width, height = image.size
    draw = ImageDraw.Draw(image)
    print('Detected Document Text')

    # Black out the value of any "social security" field on the first page.
    if doc.pages:
        page = doc.pages[0]
        for field in page.form.fields:
            if (field.key and field.value
                    and "social security" in field.key.text.lower()):
                print("!!!FOUND SOCIAL!!!")
                value_box = field.value.geometry.boundingBox
                # Pad the box slightly so the value is fully covered.
                x1 = value_box.left * width
                y1 = value_box.top * height - 2
                x2 = x1 + (value_box.width * width) + 5
                y2 = y1 + (value_box.height * height) + 2

                draw.rectangle([x1, y1, x2, y2], fill="Black")

    for block in blocks:
        DisplayBlockInformation(block)

    # Outline the key/value blocks in a single pass. This loop used to be
    # nested inside the one above, redrawing every box once per block.
    for block in blocks:
        if block['BlockType'] == "KEY_VALUE_SET":
            if 'KEY' in block['EntityTypes']:
                ShowBoundingBox(draw, block['Geometry']['BoundingBox'],
                                width, height, 'red')
            else:
                ShowBoundingBox(draw, block['Geometry']['BoundingBox'],
                                width, height, 'blue')

    # Display the image
    image.show()
    return len(blocks)
示例#18
0
def runComprehend(bucketName, objectName, callerId):
    """Run Comprehend over a stored Textract output and publish the results.

    For every page of the Textract output the page text is sent to
    Comprehend (in chunks when it exceeds ``COMPREHEND_CHARACTER_LIMIT``).
    The per-page results are posted to Elasticsearch in bulk, written to
    S3 as ``<documentId>/<documentName>/comprehend-output.json``, and the
    lineage and pipeline stage are updated.

    params:
        * bucketName: bucket holding the Textract output JSON
        * objectName: key of the Textract output JSON
        * callerId: recorded in the lineage entry

    raises:
        * AssertionError: when the documentId parsed from ``objectName``
          differs from the object's ``documentId`` tag
        * Exception: chunking, Elasticsearch and S3 failures are re-raised
          after the pipeline stage is marked as failed
    """
    comprehend = AwsHelper().getClient('comprehend')
    documentId, documentName = dissectObjectName(objectName)

    # Raised explicitly (not via ``assert``) so the check still runs when
    # Python is started with -O; the exception type is unchanged.
    taggedDocumentId = S3Helper().getTagsS3(bucketName,
                                            objectName).get('documentId', None)
    if documentId != taggedDocumentId:
        raise AssertionError(
            "File path {} does not match the expected documentId tag of the object triggered."
            .format(objectName))

    textractOutputJson = json.loads(S3Helper().readFromS3(
        bucketName, objectName))
    og = OutputGenerator(response=textractOutputJson,
                         forms=False,
                         tables=False)

    pipeline_client.body = {
        "documentId": documentId,
        "bucketName": bucketName,
        "objectName": objectName,
        "stage": PIPELINE_STAGE
    }
    pipeline_client.stageInProgress()

    document = Document(textractOutputJson)
    originalFileName = "{}/{}".format(documentId, documentName)
    comprehendFileName = originalFileName + "/comprehend-output.json"
    tagging = "documentId={}".format(documentId)

    es.connect()
    esPayload = []
    for page_num, page in enumerate(document.pages, start=1):
        table = og.structurePageTable(page)
        forms = og.structurePageForm(page)
        text = og.structurePageText(page)

        lenOfEncodedText = len(text)
        print("Comprehend documentId {} processing page {}".format(
            documentId, str(page_num)))
        print("Length of encoded text is " + str(lenOfEncodedText))
        if lenOfEncodedText > COMPREHEND_CHARACTER_LIMIT:
            print(
                "Size was too big to run singularly; breaking up the page text into chunks"
            )
            try:
                chunksOfText = chunkUpTheText(text)
            except Exception:
                pipeline_client.stageFailed(
                    "Could not determine how to snip the text on page {} into chunks."
                    .format(page_num))
                raise
            keyPhrases, entitiesDetected = batchSendToComprehend(
                comprehend, chunksOfText, 'en')
        else:
            keyPhrases, entitiesDetected = singularSendToComprehend(
                comprehend, text, 'en')

        esPageLoad = compileESPayload(es, page_num, keyPhrases,
                                      entitiesDetected, text, table, forms,
                                      documentId)
        esPayload.append(esPageLoad)

    try:
        es.post_bulk(index=esIndex, payload=esPayload)
    except Exception:
        pipeline_client.stageFailed("Could not post to Elasticsearch")
        raise

    print("Data uploaded to ES")
    try:
        S3Helper().writeToS3(json.dumps(esPayload),
                             comprehendBucket,
                             comprehendFileName,
                             taggingStr=tagging)
    except Exception:
        pipeline_client.stageFailed("Failed to write comprehend payload to S3")
        raise

    lineage_client.recordLineage({
        "documentId": documentId,
        "callerId": callerId,
        "sourceBucketName": bucketName,
        "targetBucketName": comprehendBucket,
        "sourceFileName": objectName,
        "targetFileName": comprehendFileName
    })
    pipeline_client.stageSucceeded()
    print("Comprehend data uploaded to S3 at {}".format(comprehendFileName))
import boto3
from trp import Document

# Local image to analyze
documentName = "employmentapp.png"

# Amazon Textract client
textract = boto3.client('textract')

# Send the raw image bytes to Amazon Textract and ask for tables only
with open(documentName, "rb") as document:
    response = textract.analyze_document(
        Document={'Bytes': document.read()},
        FeatureTypes=["TABLES"],
    )

# Wrap the raw response so pages, tables, rows and cells can be walked
doc = Document(response)

# Print every cell as Table[row][column] = text
for page in doc.pages:
    for table in page.tables:
        for r, row in enumerate(table.rows):
            for c, cell in enumerate(row.cells):
                cell_report = "Table[{}][{}] = {}".format(r, c, cell.text)
                print(cell_report)
def handler(event, context):
    """Index the Textract and Comprehend output of an uploaded S3 object.

    The object named in the first event record is downloaded and analyzed
    with Textract (tables and forms). Its LINE text is sent to Comprehend
    for key phrases and entities, and the combined result is indexed into
    Elasticsearch under the "document" index.

    return:
        * the string 'keyphrases Successfully Uploaded'

    raises:
        * Exception: any failure is printed and then re-raised
    """
    print("Received event: " + json.dumps(event, indent=2))

    # Get the bucket and (URL-decoded) key of the object from the event
    s3_record = event['Records'][0]['s3']
    bucket = s3_record['bucket']['name']
    key = unquote_plus(s3_record['object']['key'])
    print("key is" + key)
    print("bucket is" + bucket)
    try:
        # NOTE(review): the '{}' placeholder is never filled in, so every
        # invocation reuses this same literal path.
        local_path = '/tmp/{}'
        s3.Bucket(bucket).download_file(Key=key, Filename=local_path)
        # Read document content
        with open(local_path, 'rb') as downloaded:
            imageBytes = bytearray(downloaded.read())
        print("Object downloaded")

        response = textract.analyze_document(Document={'Bytes': imageBytes},
                                             FeatureTypes=["TABLES", "FORMS"])
        parsed = Document(response)

        # Each page overwrites the previous one, so only the output of
        # the last page is kept.
        table = []
        forms = []
        for page in parsed.pages:
            table = outputTable(page)
            forms = outputForm(page)
        print(table)

        # Join the text of every LINE block, one line per block.
        text = "".join(block['Text'] + "\n" for block in response['Blocks']
                       if block['BlockType'] == 'LINE')
        print(text)

        # Extracting Key Phrases
        keyphrase_response = comprehend.detect_key_phrases(Text=text,
                                                           LanguageCode='pt')
        textvalues = [
            phrase.get("Text")
            for phrase in keyphrase_response.get("KeyPhrases")
        ]

        # Extracting entities; a later entity of the same type replaces
        # an earlier one.
        detect_entity = comprehend.detect_entities(Text=text,
                                                   LanguageCode='pt')
        textvalues_entity = {}
        for entity in detect_entity.get("Entities"):
            entity_type = entity.get("Type").strip('\t\n\r')
            entity_text = entity.get("Text").strip('\t\n\r')
            textvalues_entity[entity_type] = entity_text

        s3url = 'https://s3.console.aws.amazon.com/s3/object/' + bucket + '/' + key + '?region=' + region

        searchdata = {
            's3link': s3url,
            'KeyPhrases': textvalues,
            'Entity': textvalues_entity,
            'text': text,
            'table': table,
            'forms': forms
        }
        print(searchdata)
        print("connecting to ES")
        es = connectES()
        es.index(index="document", doc_type="_doc", body=searchdata)
        print("data uploaded to Elasticsearch")
        return 'keyphrases Successfully Uploaded'
    except Exception as e:
        print(e)
        print('Error: ')
        raise e
示例#21
0
def test_forms(json_response):
    """The first page of the sample response has four form fields."""
    first_page = Document(json_response).pages[0]
    form_fields = first_page.form.fields
    assert len(form_fields) == 4
示例#22
0
def test_tables(json_response):
    """The first page of the sample response has exactly one table."""
    first_page = Document(json_response).pages[0]
    page_tables = first_page.tables
    assert len(page_tables) == 1
def lambda_handler(event, context):
    """
    Get Extraction Status, JobTag and JobId from SNS.
    If the Status is SUCCEEDED then build a query from the job results
    and execute it against the database.

    params:
        * event: SNS event; the message of the first record is a JSON
          string holding 'Status', 'JobTag' and 'JobId'
        * context: Lambda context object (unused)

    return:
        * None
    """
    # Each list holds regular expressions for one financial statement
    # item. NOTE(review): how they are matched is decided by make_query,
    # which is not visible here.
    pattern_caja = [
        'efectivoyvalore', 'efectiv..equiv', 'efectivoyrequiv',
        'cajaydisponible', 'efectivoequivalentesalefectivo',
        'efectivoybancos', 'vequivalentesdeefectivo', '^efectivo$'
    ]

    pattern_ingreso = [
        'totaldeingresosoperacionales', 'totalingresosoperacionales',
        'ingresosporventa', "ingresosdeactividadesordinarias",
        'ingreso.*ordinaria'
    ]

    pattern_patrimonio = [
        "^totalpatrimoni.$", "patrimoni.de", "totalcapital", "^patrimonia$",
        "patrimoni.neto", "patrimoni.total"
    ]

    pattern_activos = [
        "^totalactivo$",
        "totalactiv.s$",
        "totaldeactiv.s$",
        "totaldelactiv.$",
        "tolallactiv.$",
        "activototal$",
        "activototal$",
        "activototal$",
    ]

    pattern_pasivos = [
        "^pasivototal$", "^totalpasivo$", "t.t.lpasiv.s$", "t.t.lpasiv.$",
        "t.t.ldelp.siv.$",
        "t.t.ldep.siv.s$"
    ]

    pattern_costosventas = [
        "costodeventa", "costodeproduccion", "totaldecostos", "gastosdeventas",
        "gastosdeventa", "costosfinancieros", "costosdeventas"
    ]

    pattern_utopera = [
        "resultadosporactividadesdeoperacion", "utilidadopera",
        "utilidaddeoperacion", "gananciaporactividadesdeoperacion",
        "perdidaopera", "utilidad.*perdida.*opera"
    ]

    pattern_ubruta = [
        "utilidadbruta", "ganancia.*bruta", "utilidad.*delperiodo",
        "utilidadporactividadesoperacionales", "margen.*brut", "EBITDA",
        "utilidad.*antes.*impuesto.*ganancias"
    ]

    pattern_antes_imp = ["antes"]

    pattern_uneta = [
        "utilidadneta", "ganancianeta", "Ganancia.*perdida*neto",
        "utilidadnetaconsolidada", "resultadonetodelano", "resultadointegral"
    ]

    mypatterns = {
        'p_caja': pattern_caja,
        'p_ingreso': pattern_ingreso,
        'p_patrimonio': pattern_patrimonio,
        'p_activos': pattern_activos,
        'p_pasivos': pattern_pasivos,
        'p_cventas': pattern_costosventas,
        'p_utopera': pattern_utopera,
        'p_antesimp': pattern_antes_imp,
        'p_uneta': pattern_uneta,
        'p_ubruta': pattern_ubruta
    }

    # The SNS message is itself a JSON string; parse it once.
    notification = json.loads(event['Records'][0]['Sns']['Message'])
    pdfTextExtractionStatus = notification['Status']
    pdfTextExtractionJobTag = notification['JobTag']
    pdfTextExtractionJobId = notification['JobId']

    if pdfTextExtractionStatus != 'SUCCEEDED':
        # There are no results to store for an unsuccessful job.
        print(pdfTextExtractionJobTag + ' : ' + pdfTextExtractionStatus)
        return

    try:
        response = getJobResultsAllinOne(pdfTextExtractionJobId)
        doc = Document(response)
    except Exception:
        # Fall back to the other result fetcher; make_query is then given
        # only the first element of its result.
        response = getJobResults(pdfTextExtractionJobId)
        doc = Document(response)
        response = response[0]

    # Drop the last four characters of the job tag to get the document key.
    doc_key = pdfTextExtractionJobTag[:-4]

    # NOTE(review): the query is a string assembled by make_query from
    # document content; confirm that it escapes or parameterizes values.
    query = make_query(doc, response, doc_key, mypatterns)

    connection = get_connection()
    cursor = connection.cursor()
    try:
        cursor.execute(query)
        connection.commit()
    finally:
        cursor.close()
示例#24
0
    def ocr(self, filename):
        """Run Textract table analysis on an S3 object and keep the response.

        The response is stored on ``self.response`` and also returned.
        """
        s3_location = {
            'Bucket': 'bucketname',  # give bucket name
            'Name': filename,  # give key name of file present in S3
        }
        # Call Amazon Textract and ask for table extraction only
        textract_result = self.textract.analyze_document(
            Document={'S3Object': s3_location},
            FeatureTypes=["TABLES"],
        )
        self.response = textract_result
        return self.response


# Upload the file, then run the table OCR on it
tex_connect_obj = textract_connect()
tex_connect_obj.upload_s3()
response = tex_connect_obj.ocr('keyname')

# Wrap the raw response so pages, tables, rows and cells can be walked
doc = Document(response)

# Print every cell as Table[row][column] = text
for page in doc.pages:
    for table in page.tables:
        for r, row in enumerate(table.rows):
            for c, cell1 in enumerate(row.cells):
                cell_report = "Table[{}][{}] = {}".format(r, c, cell1.text)
                print(cell_report)
import random
import boto3
from trp import Document

bucket = "textract-acord-poc"
filename = "Multipage_test.pdf"

client = boto3.client('textract')

# Start the asynchronous analysis job for the S3 object; the response only
# carries the job id, the results have to be fetched separately.
response = client.start_document_analysis(
    DocumentLocation={'S3Object': {
        'Bucket': bucket,
        'Name': filename
    }},
    FeatureTypes=["TABLES", "FORMS"])
print(response)

import time

# Poll until the job leaves the IN_PROGRESS state instead of pausing in a
# debugger: an immediate fetch would normally return no results yet.
res = client.get_document_analysis(JobId=response["JobId"])
while res["JobStatus"] == "IN_PROGRESS":
    time.sleep(5)
    res = client.get_document_analysis(JobId=response["JobId"])

if res["JobStatus"] == "FAILED":
    raise RuntimeError("Textract job {} failed: {}".format(
        response["JobId"], res.get("StatusMessage", "no status message")))

# Results are paginated; follow NextToken so that every page of the
# multi-page document ends up in the parsed Document.
result_pages = [res]
while "NextToken" in result_pages[-1]:
    result_pages.append(
        client.get_document_analysis(
            JobId=response["JobId"],
            NextToken=result_pages[-1]["NextToken"]))

doc = Document(result_pages)
print(doc)
示例#26
0
# Key of the image inside the S3 bucket that is going to be analysed.
doco_for_scan = "Below.jpg"

# Textract client, API reference:
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/textract.html
textract = boto3.client('textract', region_name="us-east-1")

# Feature types accepted by the call: 'TABLES'|'FORMS'
s3_location = {'S3Object': {'Bucket': Bucket, 'Name': doco_for_scan}}
response = textract.analyze_document(Document=s3_location,
                                     FeatureTypes=["FORMS"])

extract = Document(response)

# Print the key/value pairs of the form fields, page by page.
for current_page in extract.pages:
    print("Key Value Pairs:")
    for field in current_page.form.fields:
        message = "Detected Key: {}, Detected Value: {}".format(
            field.key, field.value)
        print(message)

# Amazon Translate client
translate = boto3.client('translate')

# The same Textract response is reused for the translation step.
print('Lets Translate using AWS translate')
for item in response["Blocks"]:
def test_custom_page_orientation(json_response):
    """Check custom orientation metadata on documents and pages.

    The test has three parts:

    1. Sanity-check the parsed fixture (1 page, 22 lines, 53 words), set a
       document-level ``custom`` dict and verify it is serialized under the
       "Custom" key.
    2. For each rotated sample file in ``data/``, run
       ``add_page_orientation`` and assert that the first page's
       'Orientation' value lies strictly inside the expected range.
    3. Serialize the last processed document, parse it with ``t1.Document``
       and assert every page exposes a truthy ``custom['Orientation']``.

    Args:
        json_response: Textract JSON response supplied by the test setup.
    """
    doc = Document(json_response)
    assert 1 == len(doc.pages)
    lines = [line for line in doc.pages[0].lines]
    assert 22 == len(lines)
    words = [word for line in lines for word in line.words]
    assert 53 == len(words)
    t_document: t2.TDocument = t2.TDocumentSchema().load(json_response)
    t_document.custom = {'orientation': 180}
    new_t_doc_json = t2.TDocumentSchema().dump(t_document)
    assert "Custom" in new_t_doc_json
    assert "orientation" in new_t_doc_json["Custom"]
    assert new_t_doc_json["Custom"]["orientation"] == 180

    # (file name, exclusive lower bound, exclusive upper bound) of the
    # orientation expected for the first page. The order is kept because the
    # document loaded last is reused for the round-trip check below.
    expected_ranges = [
        ("gib.json", -1, 2),
        ("gib_10_degrees.json", 5, 15),
        ("gib__15_degrees.json", 10, 20),
        ("gib__25_degrees.json", 17, 30),
        ("gib__180_degrees.json", 170, 190),
        ("gib__270_degrees.json", -100, -80),
        ("gib__90_degrees.json", 80, 100),
        ("gib__minus_10_degrees.json", -10, 5),
    ]

    p = os.path.dirname(os.path.realpath(__file__))
    for file_name, lower, upper in expected_ranges:
        # Context manager so the fixture file handle is closed right away.
        with open(os.path.join(p, "data", file_name)) as f:
            j = json.load(f)
        t_document = t2.TDocumentSchema().load(j)
        t_document = add_page_orientation(t_document)
        orientation = t_document.pages[0].custom['Orientation']
        assert lower < orientation < upper, file_name

    # Round trip: the orientation must survive dumping the document and
    # parsing it again with the t1 parser.
    doc = t1.Document(t2.TDocumentSchema().dump(t_document))
    for page in doc.pages:
        assert page.custom['Orientation']