def getResults(bucketName, outputPath):
    """Load the Textract output artifacts for a document from S3.

    Args:
        bucketName: S3 bucket that holds the analysis output.
        outputPath: Key prefix under which the output files were written
            (expected to already end with the path separator).

    Returns:
        dict with the parsed per-page responses ("responseByPage"), the
        raw extracted text ("fullText"), and the text in reading order
        ("fullTextReadingOrder").
    """
    pagesKey = "{}pages.json".format(outputPath)
    textKey = "{}text.txt".format(outputPath)
    readingOrderKey = "{}text-inreadingorder.txt".format(outputPath)

    return {
        "responseByPage": json.loads(S3Helper.readFromS3(bucketName, pagesKey)),
        "fullText": S3Helper.readFromS3(bucketName, textKey),
        "fullTextReadingOrder": S3Helper.readFromS3(bucketName, readingOrderKey),
    }
def getPageResponse(request):
    """Return a document record with the raw Textract JSON for one page attached.

    Expects ``request`` to carry: "documentsTable", "outputTable",
    "documentId" and "page". When the document exists and its status is
    SUCCEEDED, the per-page response JSON is loaded from S3 and stored
    under the "textractResponse" key. Returns {} for an unknown document.
    """
    store = datastore.DocumentStore(request["documentsTable"],
                                    request["outputTable"])
    doc = store.getDocument(request["documentId"])

    if doc and doc["documentStatus"] == "SUCCEEDED":
        fileName = "{}-analysis/{}/page-{}-response.json".format(
            doc["objectName"], doc["documentId"], request["page"])
        doc["textractResponse"] = json.loads(
            S3Helper.readFromS3(doc["bucketName"], fileName))

    return doc if doc else {}
def getPageForm(request):
    """Return a document record with the parsed form key/value pairs for one page.

    Expects ``request`` to carry: "documentsTable", "outputTable",
    "documentId" and "page". When the document exists and its status is
    SUCCEEDED, the page's forms CSV is read from S3, parsed with
    ``parsePairs`` and stored under the "textractResponse" key.
    Returns {} for an unknown document.

    Fix: removed a leftover debug ``print(output)`` that ran before
    ``output`` was assigned from ``doc`` and therefore always printed
    an empty dict; this also matches the sibling getPageResponse /
    getPageTable functions.
    """
    documentsTable = request["documentsTable"]
    outputTable = request["outputTable"]
    documentId = request["documentId"]
    page = request["page"]

    ds = datastore.DocumentStore(documentsTable, outputTable)
    doc = ds.getDocument(documentId)

    if doc and doc["documentStatus"] == "SUCCEEDED":
        fileName = "{}-analysis/{}/page-{}-forms.csv".format(
            doc["objectName"], doc["documentId"], page)
        file = S3Helper.readFromS3(doc["bucketName"], fileName)
        doc["textractResponse"] = parsePairs(file)

    output = {}
    if doc:
        output = doc
    return output
def getPageTable(request):
    """Return the parsed tables extracted for one page of a document.

    Expects ``request`` to carry: "documentsTable", "outputTable",
    "documentId" and "page". When the document exists and its status is
    SUCCEEDED, the page's tables CSV is read from S3 and parsed.
    Always returns a dict of the form {"tables": [...]} — the list is
    empty when no tables were found or the document is not ready.
    """
    store = datastore.DocumentStore(request["documentsTable"],
                                    request["outputTable"])
    doc = store.getDocument(request["documentId"])

    tables = None
    if doc and doc["documentStatus"] == "SUCCEEDED":
        fileName = "{}-analysis/{}/page-{}-tables.csv".format(
            doc["objectName"], doc["documentId"], request["page"])
        csvContent = S3Helper.readFromS3(doc["bucketName"], fileName)
        tables = parseTables(getTableFromString(csvContent))

    return {"tables": tables if tables else []}
def processComprehend(self, bucket, textractResponseLocation, comprehendOutputPath, maxPages=200):
    """Run Comprehend and Comprehend Medical over a Textract result.

    Reads the Textract JSON from S3, extracts raw text page by page,
    then — in batches of PAGES_PER_BATCH pages — fans out threads for:
      * one batched Comprehend detect-entities call per batch, and
      * one Comprehend Medical entities call and one ICD-10 call per page
        (the service only handles one page at a time synchronously).
    Results are written back to S3 via the process* helpers.

    Args:
        bucket: S3 bucket holding the Textract response and the output.
        textractResponseLocation: S3 key of the Textract response JSON.
        comprehendOutputPath: S3 key prefix for the Comprehend output files.
        maxPages: hard cap on the number of pages processed (default 200).

    Returns:
        dict of combined Comprehend + Comprehend Medical entities to be
        indexed, or False when there are no pages or any page fails.

    Fix: the per-batch success check originally iterated
    ``range(pageStartIndex, pagesToProcess)`` and then added
    ``pageStartIndex`` again when indexing — a double offset. For every
    batch after the first the range was empty, so failed pages were
    never detected. The loop now iterates the batch's pages once with a
    single offset.
    """
    # Get the Textract results from S3.
    textractFile = S3Helper.readFromS3(bucket, textractResponseLocation)
    textract = json.loads(textractFile)

    # Total number of textracted pages; bail out when there is nothing.
    numOfPages = self.getNumOfPages(textract)
    if numOfPages <= 0:
        return False
    # Enforce a maximum number of pages to be processed.
    if numOfPages > maxPages:
        numOfPages = maxPages

    # Extract raw text per page for Comprehend.
    rawPages = [""] * numOfPages
    if self.extractTextByPages(textract, rawPages, numOfPages) == False:
        return False

    # Pages are processed in batches of PAGES_PER_BATCH max.
    numOfBatches = int(numOfPages / PAGES_PER_BATCH)
    if numOfPages % PAGES_PER_BATCH != 0:
        numOfBatches += 1

    # Per-page result slots for the Comprehend / Medical API calls;
    # a slot left at None after a batch means that page failed.
    comprehendEntities = [None] * numOfPages
    comprehendMedicalEntities = [None] * numOfPages
    comprehendMedicalICD10 = [None] * numOfPages

    pagesProcessed = 0
    for batch in range(0, numOfBatches):
        pageStartIndex = batch * PAGES_PER_BATCH
        pagesToProcess = numOfPages - pagesProcessed
        if pagesToProcess > PAGES_PER_BATCH:
            pagesToProcess = PAGES_PER_BATCH

        # Keep track of all threads we spawn for this batch.
        threads = list()

        # Comprehend can batch up to PAGES_PER_BATCH pages synchronously.
        x = threading.Thread(
            target=self.batchComprehendDetectEntitiesSync,
            args=(rawPages, pagesToProcess, pageStartIndex,
                  comprehendEntities))
        x.start()
        threads.append(x)

        # comprehendMedicalEntities is shared among threads.
        medicalEntitiesMutex = threading.Lock()
        # Comprehend Medical handles one page at a time synchronously;
        # the SDK handles throttling by the service.
        for index in range(0, pagesToProcess):
            x = threading.Thread(
                target=self.comprehendMedicalDetectEntitiesSync,
                args=(rawPages, pageStartIndex + index,
                      comprehendMedicalEntities, medicalEntitiesMutex))
            x.start()
            threads.append(x)

        # comprehendMedicalICD10 is shared among threads.
        medicalICD10Mutex = threading.Lock()
        for index in range(0, pagesToProcess):
            x = threading.Thread(
                target=self.comprehendMedicalDetectICD10Sync,
                args=(rawPages, pageStartIndex + index,
                      comprehendMedicalICD10, medicalICD10Mutex))
            x.start()
            threads.append(x)

        # Wait on all threads to finish their work.
        for thread in threads:
            thread.join()
        print("all threads joined...")

        # Check that every page in this batch produced a result
        # (single offset — see Fix note in the docstring).
        for i in range(0, pagesToProcess):
            page = pageStartIndex + i
            if (comprehendEntities[page] is None) or \
                    (comprehendMedicalEntities[page] is None):
                print("a page failed to process" + str(page))
                return False

        # Increment the number of pages processed for the next batch.
        pagesProcessed += pagesToProcess

    # Process Comprehend data, create the entities result file in S3.
    processedComprehendData = self.processAndReturnComprehendEntities(
        comprehendEntities, numOfPages, bucket, comprehendOutputPath)
    # Process Comprehend Medical data, create the entities result file in S3.
    comprehendMedicalEntities = self.processAndReturnComprehendMedicalEntities(
        comprehendMedicalEntities, numOfPages, bucket, comprehendOutputPath)
    # Final list of Comprehend and Comprehend Medical entities to be indexed.
    processedComprehendData.update(comprehendMedicalEntities)
    # Process Comprehend Medical data, create the ICD10 result file in S3.
    self.processComprehendMedicalICD10(
        comprehendMedicalICD10, numOfPages, bucket, comprehendOutputPath)
    return processedComprehendData
def test_read_from_s3(self):
    """readFromS3 should return exactly the body previously stored in S3."""
    expected = "Test"
    self.conn.Object(BUCKET_NAME, S3_FILE_NAME).put(Body=expected)
    actual = S3Helper.readFromS3(BUCKET_NAME, S3_FILE_NAME, REGION)
    self.assertEqual(actual, expected)
def get_file_content(aws_env: dict):
    """Read the extracted-text counterpart of the source object from S3.

    Swaps the extension of ``aws_env["objectName"]`` for ``.txt`` and
    fetches that key from ``aws_env["bucketName"]`` in
    ``aws_env["awsRegion"]``.
    """
    base, _ext = os.path.splitext(aws_env["objectName"])
    txt_key = base + ".txt"
    return S3Helper.readFromS3(aws_env['bucketName'], txt_key,
                               aws_env['awsRegion'])