Пример #1
0
def main():
    """Dump per-character layout and font information for a PDF.

    The PDF path is taken from ``sys.argv[1]`` (relative paths are resolved
    against the current working directory).  For every page this prints the
    page size, then each line of text, and for each character of the line
    (newlines excepted) its bounding box and, when a text-attribute run
    covers the character, the run's font name, size and colour.

    Raises:
        IndexError: if no path argument was supplied on the command line.
    """
    # Local import: this block cannot see or edit the file's import section.
    from pathlib import Path

    print('Version:', Poppler.get_version())

    # as_uri() percent-escapes characters such as ' ', '#' and '%' that a
    # plain 'file:' + path concatenation would leave invalid in a URI.
    uri = Path(sys.argv[1]).absolute().as_uri()
    doc = Poppler.Document.new_from_file(uri)

    for pg_no in range(doc.get_n_pages()):
        page = doc.get_page(pg_no)
        print('Page %d' % (pg_no + 1), 'size ', page.get_size())

        text = page.get_text()
        # PyGObject hands back ``str`` on Python 3; only decode real bytes.
        if isinstance(text, bytes):
            text = text.decode('UTF-8')

        # NOTE(review): get_page_layout is defined elsewhere; presumably it
        # returns one 4-float rectangle tuple per character of the page text.
        locs = get_page_layout(page)
        fonts = page.get_text_attributes()
        n_fonts = len(fonts)

        offset = 0  # index of the current line's first character in `text`
        cfont = 0   # cursor into `fonts`; only ever moves forward
        for line in text.splitlines(True):
            print(' ', line.rstrip('\n'))
            for i, ch in enumerate(line):
                if ch == '\n':
                    continue
                pos = offset + i

                # Skip runs that end before this character.  The bounds
                # check keeps the cursor valid once the runs are used up
                # (or when the page has none), instead of raising
                # IndexError on the next character.
                while cfont < n_fonts and fonts[cfont].end_index < pos:
                    cfont += 1
                font = None
                if cfont < n_fonts and fonts[cfont].start_index <= pos:
                    font = fonts[cfont]

                bb = locs[pos]
                print(ch, '(%0.2f, %0.2f, %0.2f, %0.2f)' % bb)
                if font is not None:
                    print(
                        font.font_name, font.font_size, 'r=%d g=%d, b=%d' %
                        (font.color.red, font.color.green, font.color.blue))
            offset += len(line)
            print()
        print()
Пример #2
0
def main():
    
    print 'Version:', Poppler.get_version()
    path=sys.argv[1]
    if not os.path.isabs(path):
        path=os.path.join(os.getcwd(), path)
    d=Poppler.Document.new_from_file('file:'+path)
    n=d.get_n_pages()
    for pg_no in range(n):
        p=d.get_page(pg_no)
        print 'Page %d' % (pg_no+1), 'size ', p.get_size()
        text=p.get_text().decode('UTF-8')
        locs=get_page_layout(p)
        fonts=p.get_text_attributes()
        offset=0
        cfont=0
        for line in text.splitlines(True):
            print ' ', line.encode('UTF-8'),
            n=len(line)
            for i in range(n):
                if line[i]==u'\n':
                    continue
                font=fonts[cfont]
                while font.start_index > i+offset or font.end_index < i+offset:
                    cfont+=1
                    if cfont>= len(fonts):
                        font=None
                        break
                    font=fonts[cfont]
                
                bb=locs[offset+i]
                print line[i].encode('UTF-8'), '(%0.2f, %0.2f, %0.2f, %0.2f)' % bb,
                if font:
                    print font.font_name, font.font_size, 'r=%d g=%d, b=%d'%(font.color.red, font.color.green, font.color.blue),
            offset+=n
            print
                
        print
Пример #3
0
import os
import re
import logging
import tempfile
import io
from typing import Dict, Union
from distutils.version import LooseVersion

import cairo
import gi
gi.require_version('Poppler', '0.18')
from gi.repository import Poppler, GLib

from . import abstract

# Import-time guard: refuse to load this module against a Poppler release
# older than 0.46.
poppler_version = Poppler.get_version()
if LooseVersion('0.46') > LooseVersion(poppler_version):  # pragma: no cover
    raise ValueError(
        "mat2 needs at least Poppler version 0.46 to work. "
        "The installed version is %s." % poppler_version
    )  # pragma: no cover


class PDFParser(abstract.AbstractParser):
    mimetypes = {
        'application/pdf',
    }
    meta_list = {
        'author', 'creation-date', 'creator', 'format', 'keywords', 'metadata',
        'mod-date', 'producer', 'subject', 'title', 'viewer-preferences'
    }

    def __init__(self, filename):