def save_report_pages(docid, report_num=1):
    """Render each page of a report PDF and save it as its own image file.

    The local PDF for ``docid`` / ``report_num`` is rasterised with
    ``convert_from_path``. If that raises ``PDFPageCountError``, the source
    file located by ``find_file`` is fetched with
    ``textloading.download_report`` (TIFF sources are converted to PDF with
    ``img2pdf``) and the rasterisation is retried once.

    Args:
        docid: Report identifier handed to the ``paths`` helpers.
        report_num: File number of the report; defaults to 1.
    """
    report_path = paths.get_report_name(docid,
                                        local_path=True,
                                        file_extension='.pdf',
                                        file_num=report_num)
    try:
        images = convert_from_path(report_path)
    except exceptions.PDFPageCountError:
        fname = textractor.textloading.find_file(docid)
        # Everything before the first 'cr' in the report path is used as the
        # destination folder.
        # NOTE(review): assumes 'cr' first occurs in the report id, not in a
        # parent directory name - confirm against paths.get_report_name.
        rep_folder = paths.get_report_name(
            docid, local_path=True, file_num=report_num).split('cr')[0]
        # makedirs also creates missing parents, which os.mkdir did not.
        os.makedirs(rep_folder, exist_ok=True)

        if '.tif' in fname:
            # Swap only the trailing extension. The previous
            # re.sub('.pdf', ...) used an unescaped dot and rewrote every
            # "<any char>pdf" occurrence anywhere in the path.
            report_in = os.path.splitext(report_path)[0] + '.tif'
            textloading.download_report(fname, report_in)
            # Both handles are closed deterministically; the TIFF is opened
            # first so a missing download does not truncate the PDF.
            with open(report_in, "rb") as tif_file, \
                    open(report_path, "wb") as pdf_file:
                pdf_file.write(img2pdf.convert(tif_file))
        else:
            textloading.download_report(fname, report_path)
        images = convert_from_path(report_path)

    # Page files are numbered from 1.
    for page_num, page_image in enumerate(images, start=1):
        page_image.save(paths.get_report_page_path(docid, page_num))
def report2json(report, test=False):
    """Write ``report`` to its ``.json`` file as a jsonpickle payload.

    The output of ``jsonpickle.encode`` is passed through ``json.dump``, so
    the file holds that encoded value serialised once more as JSON.

    Args:
        report: Object exposing ``docid`` and ``filenum``; it is encoded as a
            whole.
        test: When truthy the path is requested with ``local_path='test'``,
            otherwise with ``local_path=True``.
    """
    local_path = 'test' if test else True
    json_path = paths.get_report_name(report.docid,
                                      local_path=local_path,
                                      file_extension='.json',
                                      file_num=report.filenum)
    with open(json_path, "w") as out_file:
        json.dump(jsonpickle.encode(report), out_file)
def save_report_sections(report):
    """Export the report's sections to a ``_sections.docx`` document.

    Each entry of ``report.section_content`` becomes a level-1 heading
    followed by one paragraph holding the entry's content lines (each
    terminated by a newline) and a page break. Nothing is written when
    ``report.docinfo`` has no keys.

    Args:
        report: Object exposing ``docinfo``, ``section_content``, ``docid``
            and ``filenum``.
    """
    page_keys = report.docinfo.keys()
    if not len(page_keys):
        return

    document = docx.Document()
    for entry in report.section_content:
        document.add_heading(entry['Heading'], 1)
        paragraph = document.add_paragraph()
        for text_line in entry['Content']:
            paragraph.add_run(text_line + '\n')
        document.add_page_break()

    target = paths.get_report_name(report.docid,
                                   local_path=True,
                                   file_extension='_sections.docx',
                                   file_num=report.filenum)
    document.save(target)
def display_doc(docid):
    """Outline every detected line of a report in green and save one PDF.

    The document has to be of the pageinfo type; this was made for the
    restructpageinfo file, which is loaded as JSON and iterated as
    ``{page number: [line, ...]}``. Each line's ``BoundingBox`` is read as
    ``Left`` / ``Top`` / ``Width`` / ``Height`` and scaled by the rendered
    page size before drawing.

    Args:
        docid: Report identifier handed to the ``paths`` helpers.
    """
    # NOTE(review): other callers in this file pass file_extension='.pdf';
    # confirm that True is accepted by paths.get_report_name.
    report_path = paths.get_report_name(docid,
                                        local_path=True,
                                        file_extension=True)
    images = convert_from_path(report_path)
    # Context manager so the JSON handle is closed (it used to be leaked).
    with open(paths.get_restructpageinfo_file(docid), "r") as docfile:
        doc = json.load(docfile)

    drawn_images = []
    for page_key, lines in doc.items():
        # Page keys are 1-based; the rendered image list is 0-based.
        image = images[int(page_key) - 1]
        width, height = image.size
        draw = ImageDraw.Draw(image)
        for line in lines:
            box = line['BoundingBox']
            left = width * box['Left']
            top = height * box['Top']
            draw.rectangle([
                left, top, left + (width * box['Width']),
                top + (height * box['Height'])
            ],
                           outline='green')
        drawn_images.append(image)

    save_path = paths.result_path + docid + '_boxed.pdf'
    # Create the folder that will contain the PDF. The previous code called
    # os.makedirs(save_path), which created a directory at the PDF's own path
    # and made the save below fail.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    drawn_images[0].save(save_path,
                         save_all=True,
                         append_images=drawn_images[1:])
def bookmark_report(report, test=False):
    """Write a copy of the report PDF with bookmarks and TOC links added.

    Bookmarks are added for the title page, the table of contents (when
    ``report.toc_page`` is set) and every row of ``report.headings_intext``
    (``Heading == 1`` as a section, ``Heading == 2`` nested under the most
    recent section, or top-level if there is none yet). When a TOC page
    exists, each in-text heading that matches a TOC entry also gets a
    clickable link placed over that TOC line.

    Nothing is written when ``report.docinfo`` has no keys.

    Args:
        report: Object exposing ``docinfo``, ``docid``, ``filenum``,
            ``toc_page``, ``headings_intext``, ``headings``, ``subheadings``
            and ``line_dataset``.
        test: When truthy the ``_boxed.pdf`` rendering is used as input and
            ``test`` is forwarded to ``paths.get_bookmarked_file``.
    """
    if len(report.docinfo.keys()) == 0:
        return

    extension = '_boxed.pdf' if test else '.pdf'
    report_file = paths.get_report_name(report.docid,
                                        local_path=True,
                                        file_extension=extension,
                                        file_num=report.filenum)
    headings_intext = report.headings_intext

    output = PdfFileWriter()
    # The source stays open until after output.write(): the copied pages may
    # still be read from it at write time. Both files are now closed
    # deterministically instead of being leaked.
    with open(report_file, 'rb') as source_file:
        reader = PdfFileReader(source_file)
        for page in reader.pages:
            output.addPage(page)

        output.addBookmark('Title Page', 0, fit='/FitB')
        if report.toc_page:
            output.addBookmark('Table of Contents',
                               report.toc_page - 1,
                               fit='/FitB')

        section = None
        for _, row in headings_intext.iterrows():
            if row['Heading'] == 1:
                section = output.addBookmark(row['Text'],
                                             row['PageNum'] - 1,
                                             fit='/FitB')
            elif row['Heading'] == 2:
                if section is not None:
                    output.addBookmark(row['Text'],
                                       row['PageNum'] - 1,
                                       parent=section,
                                       fit='/FitB')
                else:
                    # add as a heading if section doesn't exist
                    output.addBookmark(row['Text'],
                                       row['PageNum'] - 1,
                                       fit='/FitB')

        # Entries 2 and 3 of the first page's media box are used as the page
        # width and height for every link rectangle.
        # NOTE(review): assumes all pages share the first page's size and a
        # media box starting at (0, 0) - confirm.
        refpg = output.getPage(0).mediaBox
        width, height = float(refpg[2]), float(refpg[3])

        # add links between toc lines and their intext section
        if report.toc_page:
            toc_headings = pd.concat([report.headings, report.subheadings])
            for _, row in headings_intext.iterrows():
                if row.MatchesHeading == 0:
                    continue
                toc_h = toc_headings.loc[int(row.MatchesI)]
                toc_bb = report.line_dataset.loc[
                    (report.line_dataset.PageNum == report.toc_page)
                    & (report.line_dataset.LineNum == toc_h.LineNum)].iloc[0]
                left = width * toc_bb['Left']
                # The 'Top' fraction is flipped (1 - Top) and the height is
                # subtracted: PDF y coordinates grow upwards.
                top = height * (1 - toc_bb['Top'])
                rectangle = [
                    left, top, left + (width * toc_bb['Width']),
                    top - (height * toc_bb['Height'])
                ]
                # creates link from toc heading to section page
                output.addLink(report.toc_page - 1,
                               row.PageNum - 1,
                               rect=rectangle,
                               fit='/FitB')

        outfile = paths.get_bookmarked_file(report.docid,
                                            test=test,
                                            filenum=report.filenum)
        print(outfile)
        # dirname() is empty for a bare file name; the old rsplit('/') then
        # returned the file name itself and a directory was created in its
        # place.
        out_dir = os.path.dirname(outfile)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(outfile, 'wb') as target_file:
            output.write(target_file)
def _outline_relative_box(draw, box, width, height, colour):
    """Outline ``box`` (fractions of the page size) on ``draw`` in ``colour``."""
    left = width * box['Left']
    top = height * box['Top']
    draw.rectangle([
        left, top, left + (width * box['Width']),
        top + (height * box['Height'])
    ],
                   outline=colour)


def _solid_page(image, colour):
    """Return a copy of ``image`` completely filled with ``colour``."""
    overlay = image.copy()
    ImageDraw.Draw(overlay, 'RGBA').rectangle(
        [0, 0, image.size[0], image.size[1]], fill=colour)
    return overlay


def draw_report(report):
    """Save a ``_boxed.pdf`` rendering of the report with its structure drawn.

    Per page of ``report.docinfo``:
      * marginal lines are outlined in orange;
      * the page-number token inside a marginal is outlined in red;
      * the table-of-contents page is blended with solid green and figure
        pages with solid purple;
      * section headings are outlined in blue, subsection headings in green.

    Args:
        report: Object exposing ``docid``, ``filenum``, ``docinfo``,
            ``marginals``, ``page_nums``, ``toc_page``, ``fig_pages``,
            ``section_ptrs`` and ``subsection_ptrs``.
    """
    report_path = paths.get_report_name(report.docid,
                                        local_path=True,
                                        file_extension='.pdf',
                                        file_num=report.filenum)
    images = convert_from_path(report_path)

    # Truth-testing a DataFrame raises ValueError, so DataFrames are checked
    # for rows explicitly; any other value keeps its plain truthiness.
    fig_pages = report.fig_pages
    if isinstance(fig_pages, pd.DataFrame):
        has_fig_pages = not fig_pages.empty
    else:
        has_fig_pages = bool(fig_pages)

    drawn_images = []
    for page_key, page_lines in report.docinfo.items():
        page_num = int(page_key)
        image = images[page_num - 1]  # this has to be of type RGB
        width, height = image.size
        draw = ImageDraw.Draw(image, 'RGBA')

        if page_num in report.marginals['PageNum'].values:
            # draw bb around marginals
            marginal_lines = report.marginals.loc[
                report.marginals['PageNum'] == page_num]['LineNum']
            last_marginal = None
            for line_num in marginal_lines.values:
                last_marginal = page_lines[line_num - 1]
                _outline_relative_box(draw, last_marginal['BoundingBox'],
                                      width, height, 'orange')

            # draw bb around page number (by comparing marginal content to
            # result of page number extraction)
            if (isinstance(report.page_nums, pd.DataFrame)
                    and page_num in report.page_nums['PageNum'].values):
                pg_marginal = report.page_nums.loc[
                    report.page_nums['PageNum'] == page_num]
                tokens = pg_marginal.Text.values[0].split('\t')
                # The page value is escaped so regex metacharacters in it are
                # matched literally.
                pattern = re.compile(
                    r'(^|\s)' + re.escape(str(pg_marginal['Page'].values[0]))
                    + r'($|\s)')
                token_index = next(
                    (idx for idx, token in enumerate(tokens)
                     if pattern.search(token)), None)
                # Compare against None: index 0 is a valid hit and used to be
                # dropped by a plain truthiness test.
                if token_index is not None and last_marginal is not None:
                    # NOTE(review): the token boxes are taken from the last
                    # marginal line drawn above, not from the line the page
                    # number was found on - confirm this is intended.
                    try:
                        token_box = last_marginal['OriginalBBs'][token_index]
                    except (KeyError, IndexError):
                        # No per-token box for this line; skip the highlight
                        # rather than abort the whole rendering.
                        token_box = None
                    if token_box is not None:
                        _outline_relative_box(draw, token_box, width, height,
                                              'red')

        if page_key == str(report.toc_page):
            # change colour of toc page
            # NOTE(review): argument order differs from the figure-page blend
            # below, so this page ends up 70% green - confirm intended.
            image = Image.blend(_solid_page(image, 'green'), image, alpha=0.3)
            # blend() returns a new image; rebind the drawer so the heading
            # boxes below land on the image that is actually saved.
            draw = ImageDraw.Draw(image, 'RGBA')

        if has_fig_pages and float(page_key) in fig_pages['PageNum'].values:
            # change colour of fig pages
            image = Image.blend(image, _solid_page(image, 'purple'), alpha=0.3)
            draw = ImageDraw.Draw(image, 'RGBA')

        # draw bb around section headers (blue) and subsection headers (green)
        for pointers, colour in ((report.section_ptrs, 'blue'),
                                 (report.subsection_ptrs, 'green')):
            if page_num in pointers['PageNum'].values:
                heading_lines = pointers.loc[pointers['PageNum'] ==
                                             page_num]['LineNum']
                for line_num in heading_lines.values:
                    _outline_relative_box(
                        draw, page_lines[line_num - 1]['BoundingBox'], width,
                        height, colour)

        drawn_images.append(image)

    outfile = paths.get_report_name(report.docid,
                                    local_path=True,
                                    file_extension='_boxed.pdf',
                                    file_num=report.filenum)
    drawn_images[0].save(outfile,
                         save_all=True,
                         append_images=drawn_images[1:])