Пример #1
0
 def _outputText(self, page, p):
     """Persist the reading-order text of *page* to S3 and index it.

     The reading-order extraction is used instead of the raw page.text.
     """
     ordered_text = page.getTextInReadingOrder()
     text_key = "{}page-{}-text.txt".format(self.outputPath, p)
     S3Helper.writeToS3(ordered_text, self.bucketName, text_key)
     self.saveItem(self.documentId, "page-{}-Text".format(p), text_key)
     self.indexDocument(self.bucketName, text_key, ordered_text)
    def run(self):
        """Write the full response plus per-page blocks/text/tables to S3.

        Returns None when the document has no pages, otherwise a dict with
        the last page's "csv" and "text_file" output paths (None when the
        corresponding output was not produced).
        """
        if not self.document.pages:
            return

        opath = "{}response.json".format(self.outputPath)
        S3Helper.writeToS3(json.dumps(self.response), self.bucketName, opath)

        print("Total Pages in Document: {}".format(len(self.document.pages)))

        docText = ""

        # Bug fix: csv_file was an unbound local (NameError) at the return
        # statement whenever self.tables was falsy.  Both return values are
        # pre-initialized so the return is always well-defined.
        csv_file = None
        text_file = None

        for p, page in enumerate(self.document.pages, start=1):
            # NOTE(review): every page response is written under
            # metadata['page_number'], so multi-page documents overwrite the
            # same key — sibling implementations use the loop counter p here.
            # Confirm intent before changing.
            opath = "{}page-{}-response.json".format(
                self.outputPath, self.metadata['page_number'])
            S3Helper.writeToS3(json.dumps(page.blocks), self.bucketName, opath)

            text_file = self._outputText(page, p)

            docText = docText + page.text + "\n"

            if self.tables:
                csv_file = self._outputTable(page, p)

        return {"csv": csv_file, "text_file": text_file}
Пример #3
0
    def run(self):
        """Persist the full response, per-page blocks and text, plus
        optional form/table outputs, registering each S3 key."""
        if not self.document.pages:
            return

        full_key = "{}response.json".format(self.outputPath)
        S3Helper.writeToS3(json.dumps(self.response), self.bucketName, full_key)
        self.saveItem(self.documentId, "Response", full_key)

        print("Total Pages in Document: {}".format(len(self.document.pages)))

        docText = ""
        for page_no, page in enumerate(self.document.pages, start=1):
            page_key = "{}page-{}-response.json".format(self.outputPath,
                                                        page_no)
            S3Helper.writeToS3(json.dumps(page.blocks), self.bucketName,
                               page_key)
            self.saveItem(self.documentId,
                          "page-{}-Response".format(page_no), page_key)

            self._outputText(page, page_no)
            docText += page.text + "\n"

            if self.forms:
                self._outputForm(page, page_no)
            if self.tables:
                self._outputTable(page, page_no)
Пример #4
0
    def run(self):
        """Write the pruned/rounded response JSON, run form/table outputs,
        and return the concatenated page text (None when no pages)."""
        if not self.document.pages:
            return

        opath = "{}response.json".format(self.outputPath)
        compact = json.dumps(round_floats(prune_blocks(self.response)),
                             separators=(',', ':'))
        S3Helper.writeToS3(compact, self.bucketName, opath)
        self.saveItem(self.documentId, 'Response', opath)

        print("Total Pages in Document: {}".format(len(self.document.pages)))

        pieces = []
        for page_no, page in enumerate(self.document.pages, start=1):
            pieces.append(page.text + "\n")

            if self.forms:
                self._outputForm(page, page_no)
            if self.tables:
                self._outputTable(page, page_no)

        return "".join(pieces)
    def run(self):
        """Write the pruned response JSON under the Textract prefix and run
        form/table outputs.

        Returns None when the document has no pages, otherwise a dict of
        {DOCTEXT: concatenated page text, KVPAIRS: last page's form pairs
        or None when forms are disabled}.
        """
        if not self.document.pages:
            return

        opath = "{}{}response.json".format(self.outputPath,
                                           TEXTRACT_PATH_S3_PREFIX)
        S3Helper.writeToS3(json.dumps(round_floats(prune_blocks(
            self.response)), separators=(',', ':')), self.bucketName, opath)
        self.saveItem(self.documentId,
                      '{}Response'.format(TEXTRACT_PATH_S3_PREFIX), opath)

        print("Total Pages in Document: {}".format(len(self.document.pages)))

        docText = ""
        # Bug fix: key_val_pairs was an unbound local (NameError) at the
        # return statement whenever self.forms was falsy.
        key_val_pairs = None

        for p, page in enumerate(self.document.pages, start=1):
            docText = docText + page.text + "\n"

            if self.forms:
                # Only the last page's pairs are kept — matches prior behavior.
                key_val_pairs = self._outputForm(page, p)

            if self.tables:
                self._outputTable(page, p)

        return {DOCTEXT: docText, KVPAIRS: key_val_pairs}
Пример #6
0
def write_extracted_zip(aws_env: dict, zip_tmp: str):
    """Upload every file found under *zip_tmp* to the configured S3 bucket.

    aws_env must provide 'bucketName', 'outputName' and 'awsRegion'.
    Files are uploaded as text when decodable, otherwise as raw bytes.
    """
    output_bucket = aws_env['bucketName']
    output_folder = aws_env['outputName']
    aws_region = aws_env['awsRegion']

    print("Writing s3://{0}/{1} in {2}".format(output_bucket, output_folder,
                                               aws_region))
    for path, _folders, files in os.walk(zip_tmp):
        print("=> Path: {0}".format(path))
        for file in files:
            # Bug fix: previously printed the whole `files` list per file.
            print("=> File: {0}".format(file))
            file_path = os.path.join(path, file)
            s3_output_path = os.path.join(output_folder, file)
            try:
                # Try text first; fall back to binary on decode failure.
                with open(file_path, "r") as open_file:
                    content = open_file.read()
                    # Bug fix: format string repeated index 0, printing the
                    # local path twice instead of the S3 destination.
                    print("=> Writing {0} to s3: {1}".format(
                        file_path, s3_output_path))
                    S3Helper.writeToS3(content, output_bucket, s3_output_path,
                                       aws_region)
            except UnicodeDecodeError:
                with open(file_path, "rb") as open_file:
                    content = open_file.read()
                    # Bug fix: the destination argument was passed but had no
                    # placeholder; message now matches the text branch.
                    print("=> Writing {0} to s3: {1}".format(
                        file_path, s3_output_path))
                    S3Helper.writeToS3(content, output_bucket, s3_output_path,
                                       aws_region)
    def run(self):
        """Write the full JSON response and the aggregated document text;
        run form/table outputs when enabled.

        Per-page JSON and per-page text outputs are intentionally skipped —
        only document-level artifacts are produced.
        """
        if not self.document.pages:
            return

        opath = "{}response.json".format(self.outputPath)
        S3Helper.writeToS3(json.dumps(self.response), self.bucketName, opath)
        self.saveItem(self.documentId, 'Response', opath)

        print("Total Pages in Document: {}".format(len(self.document.pages)))

        parts = []
        for page_no, page in enumerate(self.document.pages, start=1):
            parts.append(page.text + "\n")

            if self.forms:
                self._outputForm(page, page_no)
            if self.tables:
                self._outputTable(page, page_no)

        docText = "".join(parts)
        opath = "{}response.txt".format(self.outputPath)
        S3Helper.writeToS3(docText, self.bucketName, opath)
        # NOTE(review): this reuses the 'Response' key and overwrites the
        # JSON entry saved above — confirm that is intended.
        self.saveItem(self.documentId, 'Response', opath)
Пример #8
0
def write_bbox_to_s3(aws_env: dict) -> None:
    """Upload the temporary JSON and TXT bounding-box outputs to S3."""
    uploads = (
        (aws_env['tmpJsonOutput'], aws_env['outputNameJson']),
        (aws_env['tmpTxtOutput'], aws_env['outputNameTxt']),
    )
    for local_path, s3_name in uploads:
        with open(local_path, "r") as handle:
            S3Helper.writeToS3(handle.read(), aws_env['outputBucket'],
                               s3_name, aws_env['awsRegion'])
    def _outputText(self, page, p):
        """Write raw and reading-order text for *page* to S3.

        Keys use metadata['page_number'] (the p argument is unused here).
        Returns the reading-order S3 key.
        """
        page_number = self.metadata['page_number']

        raw_key = "{}page-{}-text.txt".format(self.outputPath, page_number)
        S3Helper.writeToS3(page.text, self.bucketName, raw_key)

        ordered_key = "{}page-{}-text-inreadingorder.txt".format(
            self.outputPath, page_number)
        S3Helper.writeToS3(page.getTextInReadingOrder(), self.bucketName,
                           ordered_key)

        return ordered_key
    def _outputText(self, page, p):
        """Persist raw and reading-order page text under the Textract prefix
        and register both keys in the item store."""
        prefix = TEXTRACT_PATH_S3_PREFIX

        raw_key = "{}{}page-{}-text.txt".format(self.outputPath, prefix, p)
        S3Helper.writeToS3(page.text, self.bucketName, raw_key)
        self.saveItem(self.documentId,
                      "{}page-{}-Text".format(prefix, p), raw_key)

        ordered_key = "{}{}page-{}-text-inreadingorder.txt".format(
            self.outputPath, prefix, p)
        S3Helper.writeToS3(page.getTextInReadingOrder(), self.bucketName,
                           ordered_key)
        self.saveItem(self.documentId,
                      "{}page-{}-TextInReadingOrder".format(prefix, p),
                      ordered_key)
Пример #11
0
    def _outputText(self, page, p, no_write=False):
        text = page.text
        textInReadingOrder = page.getTextInReadingOrder()

        if no_write:
            return (text, textInReadingOrder)
        else:
            opath = "{}/page-{}/text.txt".format(self.outputPath, p)
            opath = "{}/page-{}/text-inreadingorder.txt".format(
                self.outputPath, p)
            S3Helper.writeToS3(textInReadingOrder, self.bucketName, opath)
            S3Helper.writeToS3(text, self.bucketName, opath)
    def processComprehendMedicalICD10(self,
                                      comprehendMedicalICD10,
                                      numOfPages,
                                      bucket,
                                      comprehendOutputPath):
        """Collect de-duplicated ICD-10 entities for each page and write the
        result JSON to S3 under the document's Comprehend output path.

        comprehendMedicalICD10 is indexed by 0-based page; output page
        numbers are 1-based.  Entities are de-duplicated per page by
        upper-cased text.
        """
        data = {'results': []}

        for p in range(numOfPages):
            page = {'Page': p + 1, 'Entities': []}

            # Upper-cased entity texts already emitted for this page.
            seen = set()

            for e in comprehendMedicalICD10[p]:
                key = e['Text'].upper()
                if key in seen:
                    continue

                entity = {
                    'Text': e['Text'],
                    'Category': e['Category'],
                    'Type': e['Type'],
                    'ICD10CMConcepts': [],
                }

                for c in e.get('ICD10CMConcepts', []):
                    entity['ICD10CMConcepts'].append({
                        'Description': c['Description'],
                        'Code': c['Code'],
                        'Score': c['Score'],
                    })

                page['Entities'].append(entity)
                # Fix: the original added the same key twice; once suffices.
                seen.add(key)

            data['results'].append(page)

        # Create the results file in S3 under the document folder.
        S3Helper.writeToS3(json.dumps(data), bucket,
                           comprehendOutputPath + "comprehendMedicalICD10.json")
Пример #13
0
def spacy_sentences_extraction(content: str, aws_env: dict):
    """Split *content* into sentences with spaCy and upload them to S3,
    one sentence per line."""
    excluded_pipeline = ["tagger", "ner", "textcat", "parser"]
    model_path = "/opt/python/xx_ent_wiki_sm/xx_ent_wiki_sm-2.3.0"

    # Fall back to the installed package when the bundled model is absent.
    if not os.path.isdir(model_path):
        model_path = "xx_ent_wiki_sm"

    nlp = spacy.load(model_path, disable=excluded_pipeline)
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    doc = nlp(content)
    print("Pipelines names: ", nlp.pipe_names)

    lines = []
    for sent in doc.sents:
        flattened = sent.text.replace('\n', ' ')
        lines.append("{}\n".format(flattened.strip()))

    S3Helper.writeToS3("".join(lines), aws_env['outputBucket'],
                       aws_env['outputNameTxt'], aws_env['awsRegion'])
    def processAndReturnComprehendMedicalEntities(self,
                                                  comprehendMedicalEntities,
                                                  numOfPages, bucket,
                                                  comprehendOutputPath):
        """De-duplicate medical entities per page, write the result JSON to
        S3, and return a category -> list-of-texts mapping for indexing.

        Input is indexed by 0-based page; output page numbers are 1-based.
        """
        data = {'results': []}
        medical_entities_to_index = {}

        for page_index in range(numOfPages):
            page = {'Page': page_index + 1, 'Entities': []}

            # Upper-cased entity texts already emitted for this page.
            seen = set()

            for e in comprehendMedicalEntities[page_index]:
                key = e['Text'].upper()
                if key in seen:
                    continue

                entity = {'Text': e['Text'], 'Category': e['Category']}
                if 'Score' in e:
                    entity['Score'] = e['Score']
                page['Entities'].append(entity)

                medical_entities_to_index.setdefault(e['Category'],
                                                     []).append(e['Text'])
                seen.add(key)

            data['results'].append(page)

        # Create the results file in S3 under the document folder.
        S3Helper.writeToS3(
            json.dumps(data), bucket,
            comprehendOutputPath + "comprehendMedicalEntities.json")
        return medical_entities_to_index
Пример #15
0
    def processComprehendEntities(self,
                                  comprehendEntities,
                                  numOfPages,
                                  bucket,
                                  documentPath):
        """De-duplicate Comprehend entities per page, write the result JSON
        to S3, and return a type -> list-of-texts mapping for indexing.

        comprehendEntities is indexed by 0-based page and each element
        carries an 'Entities' list; output page numbers are 1-based.
        """
        data = {'results': []}
        entities_to_index = {}

        for page_index in range(numOfPages):
            page = {'Page': page_index + 1, 'Entities': []}

            # Upper-cased entity texts already emitted for this page.
            seen = set()

            for e in comprehendEntities[page_index]['Entities']:
                key = e['Text'].upper()
                if key in seen:
                    continue

                page['Entities'].append({
                    'Text': e['Text'],
                    'Type': e['Type'],
                    'Score': e['Score'],
                })
                entities_to_index.setdefault(e['Type'], []).append(e['Text'])
                seen.add(key)

            data['results'].append(page)

        # Create the results file in S3 under the document folder.
        S3Helper.writeToS3(json.dumps(data), bucket,
                           documentPath + "comprehendEntities.json")
        return entities_to_index
Пример #16
0
 def writeTextractOutputs(self, taggingStr=None):
     """Write per-page blocks, text and optional forms/tables, then the
     full response JSON (used downstream by Comprehend)."""
     if not self.document.pages:
         return

     docText = ""
     for page_no, page in enumerate(self.document.pages, start=1):
         page_key = "{}/page-{}/response.json".format(self.outputPath,
                                                      page_no)
         S3Helper.writeToS3(json.dumps(page.blocks), self.bucketName,
                            page_key, taggingStr)
         self._outputText(page, page_no)
         docText += page.text + "\n"
         if self.forms:
             self._outputForm(page, page_no)
         if self.tables:
             self._outputTable(page, page_no)

     # The complete response is written last for downstream Comprehend use.
     full_key = "{}/fullresponse.json".format(self.outputPath)
     print("Total Pages in Document: {}".format(len(self.document.pages)))
     S3Helper.writeToS3(json.dumps(self.response), self.bucketName,
                        full_key, taggingStr)
def processImage(itemId, bucketName, objectName, outputBucketName,
                 itemsTableName):
    """Run Rekognition on an S3 object, persist the response JSON, and mark
    the item complete in the items table."""
    # The Rekognition API to call is the first path segment of the key.
    apiName = objectName.split("/")[0]

    response = callRekognition(bucketName, objectName, apiName)

    print("Generating output for ItemId: {}".format(itemId))
    print(response)

    opath = "sync/{}-analysis/{}/response.json".format(objectName, itemId)
    S3Helper.writeToS3(json.dumps(response), outputBucketName, opath)

    print("ItemId: {}".format(itemId))

    datastore.ItemStore(itemsTableName).markItemComplete(itemId)
Пример #18
0
def processRequest(request):
    """Persist async Textract job results to S3, mark the item complete,
    and return a 200 response with a status message."""
    print(request)

    jobId = request['jobId']
    jobTag = request['jobTag']
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    objectName = request['objectName']
    outputBucket = request["outputBucket"]
    itemsTable = request["itemsTable"]

    pages = getJobResults(jobAPI, jobId)

    print("Result pages received: {}".format(len(pages)))
    print(pages)

    opath = "async/{}-analysis/{}/response.json".format(objectName, jobTag)
    S3Helper.writeToS3(json.dumps(pages), outputBucket, opath)

    print("ItemId: {}".format(jobTag))

    datastore.ItemStore(itemsTable).markItemComplete(jobTag)

    output = "Processed -> Item: {}, Object: {}/{} processed.".format(
        jobTag, bucketName, objectName)
    print(output)

    return {'statusCode': 200, 'body': output}
Пример #19
0
    def _outputText(self, page, p):
        """Write raw text, reading-order text, and Comprehend entity output
        for a single page, registering each S3 key in the item store."""
        raw_key = "{}page-{}-text.txt".format(self.outputPath, p)
        S3Helper.writeToS3(page.text, self.bucketName, raw_key)
        self.saveItem(self.documentId, "page-{}-Text".format(p), raw_key)

        ordered = page.getTextInReadingOrder()
        ordered_key = "{}page-{}-text-inreadingorder.txt".format(
            self.outputPath, p)
        S3Helper.writeToS3(ordered, self.bucketName, ordered_key)
        self.saveItem(self.documentId,
                      "page-{}-TextInReadingOrder".format(p), ordered_key)

        # Entity analysis runs over the reading-order text.
        entity_analysis = self.awsComprehend(ordered)
        entity_key = "{}page-{}-text-entity.txt".format(self.outputPath, p)
        S3Helper.writeToS3(entity_analysis, self.bucketName, entity_key)
        self.saveItem(self.documentId, "page-{}-EntityText".format(p),
                      entity_key)
Пример #20
0
    def run(self):
        """Write all page blocks as one pages.json, per-page text outputs,
        whole-document text (raw and reading order), and optional
        form/table outputs.

        The per-document response JSON is intentionally not written here.
        """
        if not self.document.pages:
            return

        print("Total Pages in Document: {}".format(len(self.document.pages)))

        all_blocks = [page.blocks for page in self.document.pages]
        pages_key = "{}pages.json".format(self.outputPath)
        S3Helper.writeToS3(json.dumps(all_blocks), self.bucketName, pages_key)
        self.saveItem(self.documentId, "All pages", pages_key)

        raw_parts = []
        ordered_parts = []
        for page_no, page in enumerate(self.document.pages, start=1):
            # Per-page response JSON is deliberately not written.
            self._outputText(page, page_no)

            raw_parts.append(page.text + "\n")
            ordered_parts.append(page.getTextInReadingOrder() + "\n")

            if self.forms:
                self._outputForm(page, page_no)
            if self.tables:
                self._outputTable(page, page_no)

        S3Helper.writeToS3("".join(raw_parts), self.bucketName,
                           "{}text.txt".format(self.outputPath))
        S3Helper.writeToS3("".join(ordered_parts), self.bucketName,
                           "{}text-inreadingorder.txt".format(self.outputPath))
 def test_write_to_s3(self):
     """Round-trip a small string through S3Helper and verify the body."""
     payload = "Hello World"
     S3Helper.writeToS3(payload, BUCKET_NAME, S3_FILE_NAME, REGION)
     stored = (self.conn.Object(BUCKET_NAME, S3_FILE_NAME)
               .get()['Body'].read().decode('utf-8'))
     self.assertEqual(stored, payload)