def main():
    """Dump per-character layout and font information for a PDF.

    The PDF path is taken from ``sys.argv[1]``; a relative path is joined
    onto the current working directory.  For every page the page size is
    printed, then each text line, then for every non-newline character its
    bounding box (looked up in the sequence returned by ``get_page_layout``)
    followed by the name, size and colour of the text-attribute run that
    covers it, when there is one.

    Raises:
        IndexError: if no path argument was given on the command line.
    """
    print('Version:', Poppler.get_version())

    path = sys.argv[1]
    if not os.path.isabs(path):
        path = os.path.join(os.getcwd(), path)
    d = Poppler.Document.new_from_file('file:' + path)

    for pg_no in range(d.get_n_pages()):
        p = d.get_page(pg_no)
        print('Page %d' % (pg_no + 1), 'size ', p.get_size())

        text = p.get_text()
        # Only byte strings need decoding; a text string has no .decode()
        # on Python 3, so the unconditional call would raise there.
        if isinstance(text, bytes):
            text = text.decode('UTF-8')

        locs = get_page_layout(p)
        fonts = p.get_text_attributes()
        n_fonts = len(fonts)
        offset = 0  # index of the current line's first character in the page text
        cfont = 0   # index of the attribute run currently being matched

        for line in text.splitlines(True):
            print(' ', line.encode('UTF-8'))
            for i, ch in enumerate(line):
                if ch == u'\n':
                    continue
                pos = offset + i

                # Move forward to the run whose [start_index, end_index]
                # contains pos.  The bounds check comes first so that an
                # empty or exhausted run list yields font=None instead of
                # an IndexError on this or any later character.
                while cfont < n_fonts and (
                        fonts[cfont].start_index > pos
                        or fonts[cfont].end_index < pos):
                    cfont += 1
                font = fonts[cfont] if cfont < n_fonts else None

                bb = locs[pos]
                print(
                    ch.encode('UTF-8'),
                    '(%0.2f, %0.2f, %0.2f, %0.2f)' % bb,
                )
                if font:
                    print(
                        font.font_name,
                        font.font_size,
                        'r=%d g=%d, b=%d' % (font.color.red,
                                             font.color.green,
                                             font.color.blue))
            # NOTE(review): nesting of the two trailing print() calls was
            # reconstructed from flattened source (one per line, one per
            # page) - confirm against the original layout.
            offset += len(line)
            print()
        print()
def main(): print 'Version:', Poppler.get_version() path=sys.argv[1] if not os.path.isabs(path): path=os.path.join(os.getcwd(), path) d=Poppler.Document.new_from_file('file:'+path) n=d.get_n_pages() for pg_no in range(n): p=d.get_page(pg_no) print 'Page %d' % (pg_no+1), 'size ', p.get_size() text=p.get_text().decode('UTF-8') locs=get_page_layout(p) fonts=p.get_text_attributes() offset=0 cfont=0 for line in text.splitlines(True): print ' ', line.encode('UTF-8'), n=len(line) for i in range(n): if line[i]==u'\n': continue font=fonts[cfont] while font.start_index > i+offset or font.end_index < i+offset: cfont+=1 if cfont>= len(fonts): font=None break font=fonts[cfont] bb=locs[offset+i] print line[i].encode('UTF-8'), '(%0.2f, %0.2f, %0.2f, %0.2f)' % bb, if font: print font.font_name, font.font_size, 'r=%d g=%d, b=%d'%(font.color.red, font.color.green, font.color.blue), offset+=n print print
import os import re import logging import tempfile import io from typing import Dict, Union from distutils.version import LooseVersion import cairo import gi gi.require_version('Poppler', '0.18') from gi.repository import Poppler, GLib from . import abstract poppler_version = Poppler.get_version() if LooseVersion(poppler_version) < LooseVersion('0.46'): # pragma: no cover raise ValueError("mat2 needs at least Poppler version 0.46 to work. \ The installed version is %s." % poppler_version) # pragma: no cover class PDFParser(abstract.AbstractParser): mimetypes = { 'application/pdf', } meta_list = { 'author', 'creation-date', 'creator', 'format', 'keywords', 'metadata', 'mod-date', 'producer', 'subject', 'title', 'viewer-preferences' } def __init__(self, filename):