def run(self):
    """Write the raw response plus every per-page output file.

    Returns immediately (``None``) when the document has no pages.
    Otherwise the full response is dumped as JSON, the page count is
    printed, and each page (numbered from 1) gets its block dump, word and
    text outputs, and, depending on ``self.forms`` / ``self.tables``, the
    form and table outputs.
    """
    pages = self.document.pages
    if not pages:
        return

    FileHelper.writeToFile(
        "{}-response.json".format(self.fileName),
        json.dumps(self.response))

    print("Total Pages in Document: {}".format(len(pages)))

    # Page numbers in file names are 1-based.
    for page_number, page in enumerate(pages, start=1):
        FileHelper.writeToFile(
            "{}-page-{}-response.json".format(self.fileName, page_number),
            json.dumps(page.blocks))

        self._outputWords(page, page_number)
        self._outputText(page, page_number)

        if self.forms:
            self._outputForm(page, page_number)
            self._outputFormTranslate(page, page_number)

        if self.tables:
            self._outputTable(page, page_number)
            self._outputTablePretty(page, page_number)
            self._outputTablePrettyTranslate(page, page_number)
def _outputText(self, page, p):
    """Write the page text and its reading-order variant to two files.

    Args:
        page: Page object exposing ``text`` and ``getTextInReadingOrder()``.
        p: Page number used in the output file names.
    """
    plain_path = "{}-page-{}-text.txt".format(self.fileName, p)
    FileHelper.writeToFile(plain_path, page.text)

    ordered_path = "{}-page-{}-text-inreadingorder.txt".format(
        self.fileName, p)
    FileHelper.writeToFile(ordered_path, page.getTextInReadingOrder())
def _outputTablePretty(self, page, p, table_format='github'):
    """Render each table on the page with ``tabulate`` and write it out.

    One file is written per table; the table index in the file name is
    0-based.

    Args:
        page: Page object whose ``tables`` contain ``rows`` of ``cells``,
            each cell carrying a ``text`` attribute.
        p: Page number used in the output file names.
        table_format: Forwarded to ``tabulate`` as ``tablefmt``.
    """
    for index, table in enumerate(page.tables):
        # One list of cell texts per table row.
        grid = [[cell.text for cell in row.cells] for row in table.rows]
        rendered = tabulate(grid, tablefmt=table_format)
        target = "{}-page-{}-table-{}-tables-pretty.txt".format(
            self.fileName, p, index)
        FileHelper.writeToFile(target, rendered)
def _generateInsightsPerDocument(self, page, p, insights, medicalInsights, translate, ta, tma, tt):
    """Analyse one page's text in chunks and write the collected results.

    The page text is split into consecutive chunks of at most 2000
    characters. Each chunk, together with its start offset within the page
    text, is handed to the enabled analyses; results accumulate across
    chunks and are written to files prefixed with ``self.fileName`` and the
    page number once all chunks are processed.

    Args:
        page: Page object exposing its text as ``page.text``.
        p: Page number used in the output file names.
        insights: When truthy, call ``self._insights`` per chunk and write
            the sentiment, entities, syntax and key-phrase CSV files.
        medicalInsights: When truthy, call ``self._medicalInsights`` per
            chunk and write the medical entities CSV and the PHI JSON file.
        translate: When truthy, translate each chunk with ``tt`` and write
            the concatenated translation (one line per chunk).
        ta: Analysis client forwarded to ``self._insights``.
        tma: Analysis client forwarded to ``self._medicalInsights``.
        tt: Translator exposing ``getTranslation(text)``.
    """
    max_len = 2000
    text = page.text

    sentiment = []
    syntax = []
    entities = []
    keyPhrases = []
    medicalEntities = []
    phi = []
    translated_chunks = []

    for start in range(0, len(text), max_len):
        subText = text[start:start + max_len]

        # Pass the chunk, not the whole page text: sending the full text on
        # every iteration would re-analyse it once per chunk and defeat the
        # chunking. `start` is forwarded alongside -- presumably so the
        # helpers can shift chunk-relative offsets; confirm in the helpers.
        if insights:
            self._insights(start, subText, sentiment, syntax, entities, keyPhrases, ta)

        if medicalInsights:
            self._medicalInsights(start, subText, medicalEntities, phi, tma)

        if translate:
            translated_chunks.append(tt.getTranslation(subText) + "\n")

    if insights:
        FileHelper.writeCSV("{}-page-{}-insights-sentiment.csv".format(self.fileName, p),
                            ["Sentiment"], sentiment)
        FileHelper.writeCSV("{}-page-{}-insights-entities.csv".format(self.fileName, p),
                            ["Type", "Text", "Score", "BeginOffset", "EndOffset"], entities)
        FileHelper.writeCSV("{}-page-{}-insights-syntax.csv".format(self.fileName, p),
                            ["PartOfSpeech-Tag", "PartOfSpeech-Score", "Text", "BeginOffset", "EndOffset"], syntax)
        FileHelper.writeCSV("{}-page-{}-insights-keyPhrases.csv".format(self.fileName, p),
                            ["Text", "Score", "BeginOffset", "EndOffset"], keyPhrases)

    if medicalInsights:
        FileHelper.writeCSV("{}-page-{}-medical-insights-entities.csv".format(self.fileName, p),
                            ["Text", "Type", "Category", "Score", "BeginOffset", "EndOffset"], medicalEntities)
        FileHelper.writeToFile("{}-page-{}-medical-insights-phi.json".format(self.fileName, p),
                               json.dumps(phi))

    if translate:
        FileHelper.writeToFile("{}-page-{}-text-translation.txt".format(self.fileName, p),
                               "".join(translated_chunks))
def _outputTablePrettyTranslate(self, page, p, table_format='github'):
    """Translate each table's cell text, render it, and write it to a file.

    Empty-string cells are kept as they are; every other cell is passed
    through the translator. One file is written per table, with a 0-based
    table index in the file name.

    Args:
        page: Page object whose ``tables`` contain ``rows`` of ``cells``,
            each cell carrying a ``text`` attribute.
        p: Page number used in the output file names.
        table_format: Forwarded to ``tabulate`` as ``tablefmt``.
    """
    # NOTE(review): arguments are presumably source language, target
    # language and AWS region -- confirm against TextTranslater.
    translator = TextTranslater('auto', 'en', 'us-east-1')

    for index, table in enumerate(page.tables):
        grid = [
            [
                translator.getTranslation(cell.text) if cell.text != "" else cell.text
                for cell in row.cells
            ]
            for row in table.rows
        ]
        rendered = tabulate(grid, tablefmt=table_format)
        target = "{}-page-{}-table-{}-tables-pretty-translated.txt".format(
            self.fileName, p, index)
        FileHelper.writeToFile(target, rendered)
def _outputText(self, page, p): text = page.text FileHelper.writeToFile("{}-text.txt".format(self.fileName), text) '''textInReadingOrder = page.getTextInReadingOrder()