def GetPageText(pg):
    """Return the text of page *pg* as produced by ``TextPage.extractJSON()``.

    The page is first recorded into a display list, and that list is then
    replayed through a text device that fills a text sheet / text page pair.

    pg -- a page object offering ``run(device, matrix)`` and ``bound()``.
    """
    # Record the page once into a display list through a list device.
    display_list = fitz.DisplayList()
    list_device = fitz.Device(display_list)
    pg.run(list_device, fitz.Identity)

    # Containers that the text device fills while the list is replayed.
    text_sheet = fitz.TextSheet()
    text_page = fitz.TextPage()

    # Replay the recorded content, untransformed, over the page bounds.
    page_bounds = pg.bound()
    display_list.run(
        fitz.Device(text_sheet, text_page),
        fitz.Identity,
        page_bounds,
    )

    return text_page.extractJSON()
# NOTE(review): this assignment looks like the tail of a traversal that starts
# above this excerpt -- confirm its indentation against the enclosing code.
ln = ln.next

# Build the transformation matrix from the command line: argv[3] is the zoom,
# divided by 100 to obtain the scale factor used for both axes, and argv[4]
# is the value handed to preRotate().
zoom = int(sys.argv[3])
rotate = int(sys.argv[4])
scale = zoom / 100.0
trans = fitz.Matrix(scale, scale).preRotate(rotate)

# The display list provides caching to reduce re-parsing of a page.
# It is created for the page rectangle, handed to a list device, and then
# populated by running the page through that device with the transformation
# applied.
mediabox = page.rect
dl = fitz.DisplayList(mediabox)
dv = fitz.Device(dl)
page.run(dv, trans)

# Apply the same transformation to the page rectangle to get the output area.
rect = mediabox.transform(trans)

# RGB pixmap bounded by the rounded rectangle, cleared with 0xff (white).
rgb = fitz.Colorspace(fitz.CS_RGB)
pm = fitz.Pixmap(rgb, rect.round())
pm.clearWith(0xff)

# fitz.Device(pm, None) is a drawing device; replay the display list through
# it over the computed area.
dl.run(fitz.Device(pm, None), fitz.Identity, rect)
#!/usr/bin/env python
import fitz

# Step-by-step smoke test of the text extraction pipeline: every page of the
# document is recorded into a display list and replayed through a text
# device, printing an "ok" line after each call that returned.
#
# Each print() below takes exactly one string argument, so the script gives
# the same output on Python 2 (where the parentheses merely group) and on
# Python 3 (where print is a function).

f = "sdw_2015_06.pdf"
d = fitz.Document(f)
seiten = d.pageCount  # page count of the document ("Seiten" = pages)
for seite in range(seiten):
    # Two spaces after the page number reproduce the spacing of the former
    # comma-separated print statement.
    print("=============== processing page %s  ===============" % seite)
    pg = d.loadPage(seite)

    # Record the page into a display list through a list device.
    dl = fitz.DisplayList()
    print("ok: dl = fitz.DisplayList()")
    dv = fitz.Device(dl)
    print("ok: dv = fitz.Device(dl)")
    pg.run(dv, fitz.Identity)
    print("ok: pg.run(dv, fitz.Identity)")

    # Containers filled by the text device during the replay.
    ts = fitz.TextSheet()
    print("ok: ts = fitz.TextSheet()")
    tp = fitz.TextPage()
    print("ok: tp = fitz.TextPage()")

    # Replay the display list, untransformed, over the page bounds.
    rect = pg.bound()
    dl.run(fitz.Device(ts, tp), fitz.Identity, rect)
    print("ok: dl.run(fitz.Device(ts, tp), fitz.Identity, rect)")