def init(self, page:fitz.Page) -> Layout:
    '''Create the layout of a page from its raw dict and its shapes.

    Args:
        page (fitz.Page): page to extract the layout from.

    Returns:
        Layout: the new layout, which is also stored in ``self._layout``.
    '''
    # Raw dict extracted by PyMuPDF.
    # NOTE: all these coordinates are relative to un-rotated page
    # https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
    raw = page.getText('rawdict')

    # 'width' / 'height' stored in the raw dict are based on the un-rotated
    # page, so overwrite them with the last two values of `page.rect`, which
    # always reflects the page rotation.
    page_width, page_height = tuple(page.rect)[-2:]
    raw['width'] = page_width
    raw['height'] = page_height

    self._layout = Layout(raw, page.rotationMatrix)
    layout = self._layout

    # rectangle shapes from page source
    layout.rects.from_stream(self.doc_pdf, page)

    # annotations (comment shapes) of the page, e.g.
    # highlight, underline and strike-through-line
    layout.rects.from_annotations(page)

    # nothing to plot unless debugging
    if not self.debug_mode:
        return layout

    # new section for current pdf page
    new_page_section(self._doc_debug, layout.width, layout.height, f'Page {page.number}')

    # plot the initial layout: text blocks first, then rectangle shapes
    for title, key in (
        ('Original Text Blocks', PlotControl.LAYOUT),
        ('Original Rectangle Shapes', PlotControl.SHAPE),
    ):
        layout.plot(self._doc_debug, title, key=key)

    return layout
def initialize(self, page: fitz.Page):
    '''Create the page layout: text, re-extracted images and vector paths.

    Args:
        page (fitz.Page): page to extract the layout from.

    Returns:
        The new ``Layout`` instance, which is also stored in ``self._layout``.
    '''
    # -----------------------------------------
    # raw dict
    # -----------------------------------------
    # NOTE: all these coordinates are relative to un-rotated page
    # https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
    raw = page.getText('rawdict')
    blocks = raw['blocks']

    # -----------------------------------------
    # page size
    # -----------------------------------------
    # 'width' / 'height' stored in the raw dict are based on the un-rotated
    # page, so overwrite them with the last two values of `page.rect`, which
    # always reflects the page rotation.
    page_width, page_height = tuple(page.rect)[-2:]
    raw['width'] = page_width
    raw['height'] = page_height

    # -----------------------------------------
    # page images
    # -----------------------------------------
    # image bytes from page.getText('rawdict') can't reproduce transparent
    # images: disable the image blocks (type 1) of the raw dict ...
    for block in blocks:
        if block['type'] == 1:
            block['type'] = -1

    # ... and append the re-extracted images instead
    blocks.extend(ImagesExtractor.extract_images(page))

    # -----------------------------------------
    # page paths
    # -----------------------------------------
    # convert vector graphic paths to pixmap
    self._paths_extractor = PathsExtractor()
    path_images, paths = self._paths_extractor.extract_paths(page)
    blocks.extend(path_images)
    raw['paths'] = paths

    # -----------------------------------------
    # layout
    # -----------------------------------------
    self._layout = Layout(raw, page.rotationMatrix)
    return self._layout
def _page_blocks(page:fitz.Page):
    '''Get the raw dict of a page with its image blocks adjusted.

    Image blocks are generated for every image location – whether or not
    there are any duplicates. This is in contrast to Page.getImageList(),
    which will contain each image only once.
    https://pymupdf.readthedocs.io/en/latest/textpage.html#dictionary-structure-of-extractdict-and-extractrawdict

    So, a compromise:
    - image contents come from the recovered images
      (``ImagesExtractor.extract_images``);
    - image locations come from ``page.getText('rawdict')``.

    Args:
        page (fitz.Page): page to extract blocks from.

    Returns:
        dict: the raw dict, with image contents replaced by the recovered
        ones and unmatched recovered images appended to ``'blocks'``.
    '''
    def share_bbox(img, candidates):
        '''Whether any candidate has the same rounded bbox as `img`.'''
        target = list(map(round, img['bbox']))
        return any(list(map(round, other['bbox'])) == target for other in candidates)

    # NOTE: all these coordinates are relative to un-rotated page
    # https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
    raw = page.getText('rawdict')

    # extract and recover images
    recovered = ImagesExtractor.extract_images(page)

    # group original image blocks (type 1) by the hash of their contents
    groups = defaultdict(list)
    for block in raw['blocks']:
        if block['type'] == 1:
            groups[hash(block['image'])].append(block)

    # update raw layout blocks
    for image in recovered:
        for group in groups.values():
            if share_bbox(image, group):
                # first matching group: every duplicate gets the recovered contents
                for duplicate in group:
                    duplicate['image'] = image['image']
                break
        else:
            # an image outside the page is not counted in page.getText(),
            # so let's add it here
            raw['blocks'].append(image)

    return raw
def initialize(self, page:fitz.Page):
    '''Create the layout of a page from its raw dict and its shapes.

    Args:
        page (fitz.Page): page to extract the layout from.

    Returns:
        The new ``Layout`` instance, which is also stored in ``self._layout``.
    '''
    # Raw dict extracted by PyMuPDF.
    # NOTE: all these coordinates are relative to un-rotated page
    # https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
    raw = page.getText('rawdict')

    # 'width' / 'height' stored in the raw dict are based on the un-rotated
    # page, so overwrite them with the last two values of `page.rect`, which
    # always reflects the page rotation.
    page_width, page_height = tuple(page.rect)[-2:]
    raw['width'] = page_width
    raw['height'] = page_height

    self._layout = Layout(raw, page.rotationMatrix)
    layout = self._layout

    # rectangle shapes from page source
    layout.rects.from_stream(self.doc_pdf, page)

    # annotations (comment shapes) of the page, e.g.
    # highlight, underline and strike-through-line
    layout.rects.from_annotations(page)

    return layout
def init(self, page: fitz.Page) -> Layout:
    '''Create the layout of a page from its raw dict, content streams and annotations.

    Args:
        page (fitz.Page): page to extract the layout from.

    Returns:
        Layout: the new layout, which is also stored in ``self._layout``.
    '''
    # layout based on the raw dict
    self._layout = Layout(page.getText('rawdict'))
    layout = self._layout

    # Rectangle shapes from page source:
    # these shapes are generally converted from docx, e.g. highlight, underline,
    # which are different from PDF comments like highlight, rectangle.
    if not page._isWrapped:
        page._wrapContents()

    # transformation matrix from PDF to PyMuPDF (PyMuPDF>=1.17.0)
    matrix = page.transformationMatrix

    # feed every content stream of the page to the shape parser
    for xref in page.getContents():
        stream = self._doc_pdf._getXrefStream(xref)
        layout.rects.from_stream(stream.decode(encoding="ISO-8859-1"), matrix)

    # Annotations (comment shapes) from the page: consider highlight,
    # underline, strike-through-line only.
    layout.rects.from_annotations(page.annots())

    # nothing to plot unless debugging
    if not self.debug_mode:
        return layout

    # new section for current pdf page
    new_page_section(self._doc_debug, layout.width, layout.height, f'Page {page.number}')

    # plot the initial layout: text blocks first, then rectangle shapes
    for title, key in (
        ('Original Text Blocks', PlotControl.LAYOUT),
        ('Original Rectangle Shapes', PlotControl.SHAPE),
    ):
        layout.plot(self._doc_debug, title, key=key)

    return layout
def initialize(self, page:fitz.Page):
    '''Create the layout of a page from its raw dict and its vector paths.

    Args:
        page (fitz.Page): page to extract the layout from.

    Returns:
        The new ``Layout`` instance, which is also stored in ``self._layout``.
    '''
    # Raw dict extracted by PyMuPDF.
    # NOTE: all these coordinates are relative to un-rotated page
    # https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
    raw = page.getText('rawdict')

    # 'width' / 'height' stored in the raw dict are based on the un-rotated
    # page, so overwrite them with the last two values of `page.rect`, which
    # always reflects the page rotation.
    page_width, page_height = tuple(page.rect)[-2:]
    raw['width'] = page_width
    raw['height'] = page_height

    # pdf paths and converted images: the images join the raw blocks,
    # the paths get their own entry
    self._paths = PathsExtractor()
    parsed = self._paths.parse(page)
    path_images, paths = parsed.filter_pixmaps(page)
    raw['blocks'].extend(path_images)
    raw['paths'] = paths

    # init layout
    self._layout = Layout(raw, page.rotationMatrix)
    return self._layout
def init(self, page:fitz.Page) -> Layout:
    '''Create the layout of a page from its split raw dict and its shapes.

    Args:
        page (fitz.Page): page to extract the layout from.

    Returns:
        Layout: the new layout, which is also stored in ``self._layout``.
    '''
    # layout based on the raw dict, passed through `split_blocks` first
    raw = page.getText('rawdict')
    self._layout = Layout(split_blocks(raw))
    layout = self._layout

    # rectangle shapes from page source
    layout.rects.from_stream(self.doc_pdf, page)

    # annotations (comment shapes) of the page, e.g.
    # highlight, underline and strike-through-line
    layout.rects.from_annotations(page)

    # nothing to plot unless debugging
    if not self.debug_mode:
        return layout

    # new section for current pdf page
    new_page_section(self._doc_debug, layout.width, layout.height, f'Page {page.number}')

    # plot the initial layout: text blocks first, then rectangle shapes
    for title, key in (
        ('Original Text Blocks', PlotControl.LAYOUT),
        ('Original Rectangle Shapes', PlotControl.SHAPE),
    ):
        layout.plot(self._doc_debug, title, key=key)

    return layout