def init_backward_sync(self, event):
    """Start a backward sync from a pointer event on the preview.

    The event position is turned into a 1-based page number and a position
    on that page (pixel position divided by the layouter's scale factor).
    The word and the line of text found at that position are then passed,
    together with page and position, to
    ``self.preview.document.backward_sync``.

    :param event: object providing the pointer position as ``x`` and ``y``.
    :return: ``False`` when the layouter has no layout, otherwise ``None``.
    """
    layouter = self.layouter
    if not layouter.has_layout:
        return False

    preview = self.preview

    # Distance between the top edges of two consecutive pages.
    page_stride = layouter.page_height + layouter.page_gap

    # Clamp the position to the area that is covered by the pages.
    lowest_y = page_stride * preview.number_of_pages - layouter.page_gap
    y_total_pixels = min(max(event.y - layouter.vertical_margin, 0), lowest_y)
    x_pixels = min(max(event.x - layouter.horizontal_margin, 0),
                   layouter.page_width)

    # Split the vertical position into a page index and an offset on it.
    page_index = math.floor(y_total_pixels / page_stride)
    y_pixels = min(max(y_total_pixels - page_index * page_stride, 0),
                   layouter.page_height)

    x = x_pixels / layouter.scale_factor
    y = y_pixels / layouter.scale_factor
    page = page_index + 1

    with preview.poppler_document_lock:
        poppler_page = preview.poppler_document.get_page(page_index)

        # Both corners of the rectangle are the same point, so the
        # selection is anchored at the exact position of the pointer.
        point_x = max(min(x, preview.page_width), 0)
        point_y = max(min(y, preview.page_height), 0)
        rect = Poppler.Rectangle()
        rect.x1 = point_x
        rect.y1 = point_y
        rect.x2 = point_x
        rect.y2 = point_y

        word = poppler_page.get_selected_text(
            Poppler.SelectionStyle.WORD, rect)
        context = poppler_page.get_selected_text(
            Poppler.SelectionStyle.LINE, rect)

    preview.document.backward_sync(page, x, y, word, context)
def pdf_area(page, stroke):
    """
    Compute the PDF page area covered by a stroke.

    The area is the bounding box of the points that ``to_point`` returns
    for the stroke's segments. It is widened by 15 units on the left and on
    the right, then multiplied by the factor ``pdf_scale`` returns for the
    page.

    :param page: Poppler PDF page object.
    :param stroke: reMarkable tablet stroke data.
    :return: ``Poppler.Rectangle`` holding the scaled bounding box.
    """
    points = [to_point(segment) for segment in stroke.segments]
    xs = [point[0] for point in points]
    ys = [point[1] for point in points]

    x_low = min(xs)
    x_high = max(xs)
    y_low = min(ys)
    y_high = max(ys)

    factor = pdf_scale(page)

    area = Poppler.Rectangle()
    area.x1 = (x_low - 15) * factor
    area.y1 = y_low * factor
    area.x2 = (x_high + 15) * factor
    area.y2 = y_high * factor

    # The rectangle must have a positive width and a positive height.
    assert area.x1 < area.x2
    assert area.y1 < area.y2
    return area
def get_annot_action(self, link_type, action, rect):
    """ Build the callable that is run when an annotation's link is followed.

    Rendition actions are turned into a media entry that is appended to
    ``self.medias``; every other action type is handed over to
    :func:`get_link_action`.

    Args:
        link_type (:class:`~Poppler.ActionType`): The link type
        action (:class:`~Poppler.Action`): The action to be performed when the link is clicked
        rect (:class:`~Poppler.Rectangle`): The region of the page where the link is

    Returns:
        `function`: The function to be called to follow the link, or `None` if the media file is unavailable
    """
    if link_type != Poppler.ActionType.RENDITION:
        return self.get_link_action(link_type, action)

    rendition_media = action.rendition.media
    if rendition_media.is_embedded():
        ext = get_extension(rendition_media.get_mime_type())
        # Only used to reserve a file name: the file is kept on close
        # (delete=False) and registered for removal on exit.
        with tempfile.NamedTemporaryFile('wb', suffix=ext, prefix='pdf_embed_', delete=False) as f:
            filename = f.name
            self.parent.remove_on_exit(filename)

        if not rendition_media.save(filename):
            logger.error(_("Pympress can not extract embedded media"))
            return None
    else:
        filename = self.parent.get_full_path(rendition_media.get_filename())
        if not filename:
            logger.error(_("Pympress can not find file ") + rendition_media.get_filename())
            return None

    # TODO grab the show_controls, autoplay, repeat
    # Margins around the link area, as fractions of the page size.
    margins = Poppler.Rectangle()
    margins.x1 = rect.x1 / self.pw        # left
    margins.x2 = 1.0 - rect.x2 / self.pw  # right
    margins.y1 = rect.y1 / self.ph        # bottom
    margins.y2 = 1.0 - rect.y2 / self.ph  # top

    media_entry = (margins, filename, False)
    self.medias.append(media_entry)
    return Link.build_closure(self.parent.play_media, hash(media_entry))
def main():
    """Dump the text of a PDF with per-character layout and font details.

    The path of the PDF is taken from the first command line argument. For
    every page the page size is printed, followed by each line of text and,
    for every character of the line except newlines, the entry of
    ``get_page_layout`` at the character's index and the matching text
    attributes (font name, font size and color).

    Raises:
        SystemExit: when no path is given on the command line.
    """
    # Local import: only this entry point needs it.
    from pathlib import Path

    print('Version:', Poppler.get_version())
    if len(sys.argv) < 2:
        sys.exit('usage: %s FILE.pdf' % sys.argv[0])

    # Build a properly escaped file URI; concatenating 'file:' and the path
    # breaks on characters such as spaces, '#' or '%'.
    uri = Path(os.path.abspath(sys.argv[1])).as_uri()
    d = Poppler.Document.new_from_file(uri)

    for pg_no in range(d.get_n_pages()):
        p = d.get_page(pg_no)
        print('Page %d' % (pg_no + 1), 'size ', p.get_size())

        text = p.get_text()
        # The bindings hand out `str` on Python 3; only decode real bytes.
        if isinstance(text, bytes):
            text = text.decode('UTF-8')
        locs = get_page_layout(p)
        fonts = p.get_text_attributes()

        offset = 0  # index of the first character of the line in the page
        cfont = 0   # index of the attribute run that is currently examined
        for line in text.splitlines(True):
            print(' ', line.rstrip('\n'))
            for i, char in enumerate(line):
                if char == '\n':
                    continue

                # Move on to the attribute run that covers this character.
                # `font` stays None once the runs are exhausted, instead of
                # indexing past the end of the list.
                index = offset + i
                font = None
                while cfont < len(fonts):
                    candidate = fonts[cfont]
                    if candidate.start_index <= index <= candidate.end_index:
                        font = candidate
                        break
                    cfont += 1

                bb = locs[index]
                print(char, '(%0.2f, %0.2f, %0.2f, %0.2f)' % bb)
                if font:
                    print(
                        font.font_name, font.font_size,
                        'r=%d g=%d, b=%d' % (font.color.red, font.color.green, font.color.blue))
            offset += len(line)
            print()
        print()
def init_backward_sync(self, event):
    """Start a backward sync from a pointer event on the preview.

    The event position is turned into a 1-based page number and a position
    on that page (pixel position divided by the layout's scale factor). The
    word and the line of text found at that position are then passed,
    together with page and position, to
    ``self.preview.document.build_system.backward_sync``.

    :param event: object providing the pointer position as ``x`` and ``y``.
    :return: ``False`` when the preview has no layout, otherwise ``None``.
    """
    preview = self.preview
    layout = preview.layout
    # Identity test: `== None` would be routed through a custom `__eq__`.
    if layout is None:
        return False

    window_width = self.view.get_allocated_width()

    # Distance between the top edges of two consecutive pages.
    page_stride = layout.page_height + layout.page_gap

    # Clamp the position to the area that is covered by the pages.
    lowest_y = (page_stride * preview.poppler_document.get_n_pages()
                - layout.page_gap)
    y_total_pixels = min(max(event.y, 0), lowest_y)
    x_pixels = min(
        max(event.x - layout.get_horizontal_margin(window_width), 0),
        layout.page_width)

    # Split the vertical position into a page index and an offset on it.
    page_index = math.floor(y_total_pixels / page_stride)
    y_pixels = min(max(y_total_pixels - page_index * page_stride, 0),
                   layout.page_height)

    x = x_pixels / layout.scale_factor
    y = y_pixels / layout.scale_factor
    page = page_index + 1

    poppler_page = preview.poppler_document.get_page(page_index)

    # Both corners of the rectangle are the same point, so the selection is
    # anchored at the exact position of the pointer.
    point_x = max(min(x, preview.page_width), 0)
    point_y = max(min(y, preview.page_height), 0)
    rect = Poppler.Rectangle()
    rect.x1 = point_x
    rect.y1 = point_y
    rect.x2 = point_x
    rect.y2 = point_y

    word = poppler_page.get_selected_text(Poppler.SelectionStyle.WORD, rect)
    context = poppler_page.get_selected_text(Poppler.SelectionStyle.LINE, rect)
    preview.document.build_system.backward_sync(page, x, y, word, context)
def get_annot_action(self, link_type, action, rect):
    """ Build the callable that is run when an annotation's link is followed.

    Rendition actions are turned into a media entry that is appended to
    ``self.medias``; every other action type is handed over to
    :func:`get_link_action`. ``None`` is returned when the media file can
    neither be extracted nor found.
    """
    if link_type != Poppler.ActionType.RENDITION:
        return self.get_link_action(link_type, action)

    rendition_media = action.rendition.media
    if rendition_media.is_embedded():
        ext = get_extension(rendition_media.get_mime_type())
        # Only used to reserve a file name: the file is kept on close
        # (delete=False) and registered for removal on exit.
        with tempfile.NamedTemporaryFile('wb', suffix=ext, prefix='pdf_embed_', delete=False) as f:
            filename = f.name
            self.parent.remove_on_exit(filename)

        if not rendition_media.save(filename):
            print(_("Pympress can not extract embedded media"))
            return None
    else:
        filename = self.parent.get_full_path(rendition_media.get_filename())
        if not filename:
            print(_("Pympress can not find file ") + rendition_media.get_filename())
            return None

    # TODO grab the show_controls, autoplay, repeat
    # Margins around the link area, as fractions of the page size.
    margins = Poppler.Rectangle()
    margins.x1 = rect.x1 / self.pw        # left
    margins.x2 = 1.0 - rect.x2 / self.pw  # right
    margins.y1 = rect.y1 / self.ph        # bottom
    margins.y2 = 1.0 - rect.y2 / self.ph  # top

    media_entry = (margins, filename, False)
    self.medias.append(media_entry)
    return lambda: pympress.ui.UI.play_media(hash(media_entry))
def main(): print 'Version:', Poppler.get_version() path=sys.argv[1] if not os.path.isabs(path): path=os.path.join(os.getcwd(), path) d=Poppler.Document.new_from_file('file:'+path) n=d.get_n_pages() for pg_no in range(n): p=d.get_page(pg_no) print 'Page %d' % (pg_no+1), 'size ', p.get_size() text=p.get_text().decode('UTF-8') locs=get_page_layout(p) fonts=p.get_text_attributes() offset=0 cfont=0 for line in text.splitlines(True): print ' ', line.encode('UTF-8'), n=len(line) for i in range(n): if line[i]==u'\n': continue font=fonts[cfont] while font.start_index > i+offset or font.end_index < i+offset: cfont+=1 if cfont>= len(fonts): font=None break font=fonts[cfont] bb=locs[offset+i] print line[i].encode('UTF-8'), '(%0.2f, %0.2f, %0.2f, %0.2f)' % bb, if font: print font.font_name, font.font_size, 'r=%d g=%d, b=%d'%(font.color.red, font.color.green, font.color.blue), offset+=n print print
def get_structure(self, index_iter=None):
    """ Gets the structure of the document from its index.

    Recursive, pass the iterator.

    Args:
        index_iter (:class:`~Poppler.IndexIter` or `None`): the iterator for the child index to explore.

    Returns:
        `dict`: Maps a page index (destination page number minus one) to a `dict` holding the
        entry's ``'title'`` and, when the entry has sub-entries, ``'children'`` with the same kind
        of mapping. Empty when no index can be read.
    """
    try:
        if index_iter is None:
            index_iter = Poppler.IndexIter(self.doc)
    except TypeError:
        return {}

    if index_iter is None:
        return {}

    index = {}
    while True:
        action = index_iter.get_action()
        title = ''
        try:
            if action.type == Poppler.ActionType.GOTO_DEST:
                title = action.goto_dest.title
                if action.goto_dest.dest.type == Poppler.DestType.NAMED:
                    dest = self.parent.doc.find_dest(action.goto_dest.dest.named_dest)
                    page = dest.page_num - 1
                elif action.goto_dest.dest.type == Poppler.DestType.UNKNOWN:
                    raise AssertionError('Unknown type of destination')
                else:
                    page = action.goto_dest.dest.page_num - 1
            else:
                raise AssertionError('Unexpected type of action')
        except Exception:
            # `action` itself may be what failed, so do not dereference it blindly here
            logger.error(_('Unexpected action in index "{}"').format(getattr(action, 'type', None)))
            page = None

        new_entry = {'title': title}
        child = index_iter.get_child()
        if child:
            new_entry['children'] = self.get_structure(child)

        # there should not be synonymous sections, correct the page here to a better guess
        if page is None or page in index:
            if new_entry.get('children'):
                page = min(new_entry['children'])
            else:
                # Walk down the last entry of each level to find the highest page recorded so far.
                # The descent has to go through the 'children' mappings: the entries themselves
                # only have the keys 'title' and 'children'.
                lower_bound = -1
                level = index
                while level:
                    lower_bound = max(level)
                    level = level[lower_bound].get('children')

                if page is None:
                    # No page whose label could be matched: take the next free page
                    page = lower_bound + 1
                else:
                    try:
                        label = self.page_labels[page]
                        page = min(number for number, other in enumerate(self.page_labels)
                                   if other == label and number > lower_bound)
                    except (ValueError, IndexError):
                        # no later page with the same label, or no label for this page
                        page = lower_bound + 1

        index[page] = new_entry

        if not index_iter.next():
            break

    return index
def __init__(self, page, number, parent):
    """ Read the label, size, links and annotations of a page.

    Links and actionable annotations are stored in ``self.links``, media found in annotations in
    ``self.medias`` and the text contents of annotations in ``self.annotations``.

    Args:
        page: the poppler object around the page (presumably a :class:`~Poppler.Page`)
        number: number of the page, stored as ``page_nb``
        parent: the owner of the page; ``get_full_path``, ``play_media`` and ``remove_on_exit``
            are called on it
    """
    self.page = page
    self.page_nb = number
    self.parent = parent
    self.page_label = self.page.get_label()
    self.links = []
    self.medias = []
    self.annotations = []

    # Read page size
    self.pw, self.ph = self.page.get_size()

    # Read links on the page
    for link in self.page.get_link_mapping():
        action = self.get_link_action(link.action.type, link.action)
        my_link = Link(link.area.x1, link.area.y1, link.area.x2, link.area.y2, action)
        self.links.append(my_link)

    # Read annotations, in particular those that indicate media
    for annotation in self.page.get_annot_mapping():
        content = annotation.annot.get_contents()
        if content:
            self.annotations.append(content)

        annot_type = annotation.annot.get_annot_type()
        if annot_type == Poppler.AnnotType.LINK:
            # just an Annot, not subclassed -- probably redundant with links
            continue
        elif annot_type == Poppler.AnnotType.MOVIE:
            movie = annotation.annot.get_movie()
            filepath = self.parent.get_full_path(movie.get_filename())
            if filepath:
                # TODO there is no autoplay, or repeatCount
                # Margins around the annotation, as fractions of the page size
                relative_margins = Poppler.Rectangle()
                relative_margins.x1 = annotation.area.x1 / self.pw        # left
                relative_margins.x2 = 1.0 - annotation.area.x2 / self.pw  # right
                relative_margins.y1 = annotation.area.y1 / self.ph        # bottom
                relative_margins.y2 = 1.0 - annotation.area.y2 / self.ph  # top
                media = (relative_margins, filepath, movie.show_controls())
                self.medias.append(media)
                action = Link.build_closure(self.parent.play_media, hash(media))
            else:
                logger.error(_("Pympress can not find file ") + movie.get_filename())
                continue
        elif annot_type == Poppler.AnnotType.SCREEN:
            action_obj = annotation.annot.get_action()
            if not action_obj:
                continue
            action = self.get_annot_action(action_obj.any.type, action_obj, annotation.area)
            if not action:
                continue
        elif annot_type == Poppler.AnnotType.FILE_ATTACHMENT:
            attachment = annotation.annot.get_attachment()
            # The name comes from the PDF: keep only its last path component, so that the
            # temporary file can not end up outside of the temporary directory.
            prefix, ext = os.path.splitext(os.path.basename(attachment.name or ''))
            with tempfile.NamedTemporaryFile('wb', suffix=ext, prefix=prefix, delete=False) as f:
                # now the file name is shotgunned
                filename = f.name
                self.parent.remove_on_exit(filename)
            if not attachment.save(filename):
                logger.error(_("Pympress can not extract attached file"))
                continue
            action = Link.build_closure(fileopen, filename)
        elif annot_type in {Poppler.AnnotType.TEXT, Poppler.AnnotType.POPUP,
                            Poppler.AnnotType.FREE_TEXT}:
            # text-only annotations, hide them from screen
            self.page.remove_annot(annotation.annot)
            continue
        elif annot_type in {Poppler.AnnotType.STRIKE_OUT, Poppler.AnnotType.HIGHLIGHT,
                            Poppler.AnnotType.UNDERLINE, Poppler.AnnotType.SQUIGGLY,
                            Poppler.AnnotType.POLYGON, Poppler.AnnotType.POLY_LINE,
                            Poppler.AnnotType.SQUARE, Poppler.AnnotType.CIRCLE,
                            Poppler.AnnotType.CARET, Poppler.AnnotType.LINE,
                            Poppler.AnnotType.STAMP, Poppler.AnnotType.INK}:
            # Poppler already renders annotation of these types, nothing more can be done
            # even though the rendering isn't always perfect.
            continue
        else:
            logger.warning(_("Pympress can not interpret annotation of type:") + " {} ".format(annot_type))
            continue

        # Only annotations that produced an action get here
        my_annotation = Link(annotation.area.x1, annotation.area.y1,
                             annotation.area.x2, annotation.area.y2, action)
        self.links.append(my_annotation)
def __init__(self, page, number, parent):
    """ Read the size, links and annotations of a page.

    Links and actionable annotations are stored in ``self.links``, media found in annotations in
    ``self.medias`` and the contents of text annotations in ``self.annotations``.

    Args:
        page (:class:`Poppler.Page`): the poppler object around the page
        number (integer): number of the page to fetch in the document
        parent (:class:`pympress.document.Document`): the parent Document class
    """
    self.page = page
    self.page_nb = number
    self.parent = parent
    self.links = []
    self.medias = []
    self.annotations = []

    # Read page size
    self.pw, self.ph = self.page.get_size()

    # Read links on the page
    for link in self.page.get_link_mapping():
        action = self.get_link_action(link.action.type, link.action)
        my_link = Link(link.area.x1, link.area.y1, link.area.x2, link.area.y2, action)
        self.links.append(my_link)

    # Read annotations, in particular those that indicate media
    for annotation in self.page.get_annot_mapping():
        annot_type = annotation.annot.get_annot_type()
        if annot_type == Poppler.AnnotType.LINK:
            # just an Annot, not subclassed -- probably redundant with links
            continue
        elif annot_type == Poppler.AnnotType.MOVIE:
            movie = annotation.annot.get_movie()
            filepath = self.parent.get_full_path(movie.get_filename())
            if filepath:
                # TODO there is no autoplay, or repeatCount
                # Margins around the annotation, as fractions of the page size
                relative_margins = Poppler.Rectangle()
                relative_margins.x1 = annotation.area.x1 / self.pw        # left
                relative_margins.x2 = 1.0 - annotation.area.x2 / self.pw  # right
                relative_margins.y1 = annotation.area.y1 / self.ph        # bottom
                relative_margins.y2 = 1.0 - annotation.area.y2 / self.ph  # top
                media = (relative_margins, filepath, movie.show_controls())
                self.medias.append(media)
                # Bind this iteration's media as a default value: a plain closure would look up
                # `media` when the link is followed, i.e. after the loop has moved on to the
                # last movie of the page.
                action = lambda media=media: pympress.ui.UI.play_media(hash(media))
            else:
                logger.error(_("Pympress can not find file ") + movie.get_filename())
                continue
        elif annot_type == Poppler.AnnotType.SCREEN:
            action_obj = annotation.annot.get_action()
            if not action_obj:
                # nothing to do for a screen annotation without an action
                continue
            action = self.get_annot_action(action_obj.any.type, action_obj, annotation.area)
            if not action:
                continue
        elif annot_type == Poppler.AnnotType.TEXT:
            self.annotations.append(annotation.annot.get_contents())
            # hide post-it sort of button on screen
            self.page.remove_annot(annotation.annot)
            continue
        elif annot_type == Poppler.AnnotType.FREE_TEXT:
            # Poppler already renders annotation of this type
            continue
        else:
            logger.warning(_("Pympress can not interpret annotation of type:") + " {} ".format(annot_type))
            continue

        # Only annotations that produced an action get here
        my_annotation = Link(annotation.area.x1, annotation.area.y1,
                             annotation.area.x2, annotation.area.y2, action)
        self.links.append(my_annotation)
import os import re import logging import tempfile import io from typing import Dict, Union from distutils.version import LooseVersion import cairo import gi gi.require_version('Poppler', '0.18') from gi.repository import Poppler, GLib from . import abstract poppler_version = Poppler.get_version() if LooseVersion(poppler_version) < LooseVersion('0.46'): # pragma: no cover raise ValueError("mat2 needs at least Poppler version 0.46 to work. \ The installed version is %s." % poppler_version) # pragma: no cover class PDFParser(abstract.AbstractParser): mimetypes = { 'application/pdf', } meta_list = { 'author', 'creation-date', 'creator', 'format', 'keywords', 'metadata', 'mod-date', 'producer', 'subject', 'title', 'viewer-preferences' } def __init__(self, filename):