def init_test(self, filename):
    '''Initialize the benchmark layout and the freshly parsed layout.

    The benchmark ("sample") layout is restored from
    ``<self.layout_dir>/<filename>.json``; the layout under test is produced
    by converting the first page of ``<self.sample_dir>/<filename>.pdf``,
    with ``<self.output_dir>/<filename>.docx`` passed as the target path.

    Args:
        filename: base name, without extension, shared by the json, pdf and
            docx files.

    Returns:
        ``self``, with ``self.sample`` and ``self.test`` set, so that calls
        can be chained.

    Raises:
        OSError: if the layout json file cannot be opened.
        json.JSONDecodeError: if the layout file is not valid JSON.
    '''
    # restore sample layout
    layout_file = os.path.join(self.layout_dir, f'{filename}.json')
    with open(layout_file, 'r') as f:
        raw_dict = json.load(f)
    self.sample = Layout(raw_dict)

    # parsed layout: first page only
    pdf_file = os.path.join(self.sample_dir, f'{filename}.pdf')
    docx_file = os.path.join(self.output_dir, f'{filename}.docx')
    cv = Converter(pdf_file, docx_file)
    try:
        cv.make_page(cv[0], debug=False)
        # grab the layout before the converter is closed
        self.test = cv.layout  # type: Layout
    finally:
        # always release the converter, even when parsing the page fails
        cv.close()

    return self
def local_test(filename, make_test_case=False):
    '''Convert the first page of a local pdf file and check the result.

    Both ``<filename>.pdf`` and ``<filename>.docx`` are located in the
    module-level ``output`` directory.

    Args:
        filename: base name, without extension, of the pdf file to convert.
        make_test_case: forwarded unchanged as the last argument of
            ``check_result``.
    '''
    pdf_file = os.path.join(output, f'{filename}.pdf')
    docx_file = os.path.join(output, f'{filename}.docx')
    cv = Converter(pdf_file, docx_file)

    try:
        # process page by page; the slice restricts the run to the first page
        for page in cv[0:1]:
            # parse layout
            cv.make_page(page)
    finally:
        # close pdf, even when parsing a page raises
        cv.close()

    # check results
    check_result(pdf_file, docx_file, 'comparison.pdf', make_test_case)
# Only the first page is selected; the slice keeps this a loop so the range
# can be widened later.
first_pages = cv[0:1]
for pdf_page in first_pages:
    # parse the page layout
    cv.make_page(pdf_page)

# close the pdf once every selected page has been processed
cv.close()

# No result check is performed after the conversion.