def repair_pdf(origname, newname):
    """
    Attempt to repair a PDF file.

    The file is first parsed with ``PdfReader``. Only when that fails is the
    raw data re-written through PyMuPDF and saved to `newname`.

    Args:
        origname: path of the PDF to check.
        newname: path the repaired PDF is written to (may equal `origname`).

    Returns:
        "File Not Found" if `origname` does not exist,
        ""               if the file parsed cleanly (nothing was written),
        True             if a repaired copy was written to `newname`,
        False            if the repair attempt raised ValueError.
    """
    try:
        # The context manager guarantees the handle is released even when
        # read() fails.
        with open(origname, "rb") as ifile:
            idata = ifile.read()        # put in memory
    except FileNotFoundError:
        return "File Not Found"

    try:
        PdfReader(BytesIO(idata))       # parse attempt; result is not needed
        return ""                       # file did not need to be repaired
    except Exception:
        # Problem! Heal it with PyMuPDF. `Exception` (rather than a bare
        # except) lets KeyboardInterrupt / SystemExit propagate.
        doc = fitz.open("pdf", idata)
        try:
            try:
                # corrected version in memory
                fixed = doc.write(garbage=3, deflate=1, clean=1)
            finally:
                doc.close()             # release the document on every path
            doc = idata = None          # free storage
            PdfWriter(newname, trailer=PdfReader(BytesIO(fixed))).write()
            return True                 # file has been fixed
        except ValueError:
            return False
def __init__(self, parent, pdf_file):
    """
    Open a PDF and remember its page count and first-page geometry.

    :param `parent`: stored unchanged as ``self.parent``.
    :param `pdf_file`: a path string, which is passed to ``fitz.open``
     directly, or a file-like object supporting ``tell``, ``seek`` and
     ``read``, whose complete content is passed to ``fitz.open`` as a stream.
    """
    self.parent = parent

    if not isinstance(pdf_file, string_types):
        # File-like object: make sure we read from the beginning, then hand
        # the bytes to fitz. The '.pdf' pseudo name identifies the stream type.
        if pdf_file.tell() > 0:
            pdf_file.seek(0)
        content = bytearray(pdf_file.read())
        self.pdfdoc = fitz.open('fileobject.pdf', content)
    else:
        # A filename/path string.
        self.pdfdoc = fitz.open(pdf_file)

    self.numpages = self.pdfdoc.pageCount

    # Page geometry is taken from the first page only.
    first_page = self.pdfdoc.loadPage(0)
    bounds = first_page.bound()
    self.pagewidth = bounds.width
    self.pageheight = bounds.height
    self.page_rect = bounds

    self.zoom_error = False     # set if memory errors during render
def document(self):
    """Open ``self.filename_or_fobj`` as a PDF and return the pymupdf document.

    ``get_filename_and_fobj`` yields a (filename, file object) pair. When a
    filename is available the document is opened from disk; otherwise the
    file object is read completely and opened from memory.
    """
    path, stream = get_filename_and_fobj(self.filename_or_fobj, mode="rb")
    if path:
        return pymupdf.open(filename=path, filetype="pdf")
    # TODO: may use a lot of memory
    return pymupdf.open(stream=stream.read(), filetype="pdf")
def return_image_obj(fs_path, memory=False):
    """
    Open an image, or the first page of a PDF, and return it as a Pillow image.

    A path whose extension is ``pdf`` (case-insensitive) is checked with
    ``pdf_utilities.check_pdf`` and, if flagged, repaired in place with
    ``pdf_utilities.repair_pdf``. Its first page is then rendered with alpha
    and loaded into Pillow from an in-memory PNG.

    Args:
        fs_path (str | bytes): file system path. When `memory` is true this
            is instead the raw image data.
        memory (bool): treat `fs_path` as in-memory image bytes rather than
            a path. Ignored for PDFs.

    Returns:
        The Pillow image object, or ``None`` when the image could not be
        loaded (IOError / UserWarning in the handled branches).
    """
    source_image = None
    if os.path.splitext(fs_path)[1][1:].lower() == u"pdf":
        results = pdf_utilities.check_pdf(fs_path)
        if results[0] == False:
            pdf_utilities.repair_pdf(fs_path, fs_path)
        pdf_file = fitz.open(fs_path)
        try:
            pdf_page = pdf_file.loadPage(0)
            pix = pdf_page.getPixmap(matrix=fitz.Identity, alpha=True)
            png_data = pix.getPNGData()
        finally:
            # The PNG bytes are self-contained, so the document can be
            # released before Pillow decodes them.
            pdf_file.close()
        try:
            source_image = Image.open(BytesIO(png_data))
        except UserWarning:
            print ("UserWarning!")
            source_image = None
    else:
        if not memory:
            source_image = Image.open(fs_path)
        else:
            try:    # fs_path is a byte stream
                source_image = Image.open(BytesIO(fs_path))
            except IOError:
                print("IOError")
                log.debug("PIL was unable to identify as an image file")
            except UserWarning:
                print ("UserWarning!")
                source_image = None
    return source_image
def getPDFinfo():
    """Open ``spad.file`` and load page count, metadata and ToC into ``spad``.

    If a ``<file>.json`` side file from a previous edit session exists, the
    user is asked whether to use it. On "yes" its ToC and metadata replace
    the values read from the PDF; on "no" the side file is deleted.

    Returns:
        True if the document is still encrypted after the decryption attempt
        (nothing else is loaded in that case), otherwise False.
    """
    spad.doc = fitz.open(spad.file)
    if spad.doc.needsPass:
        decrypt_doc()
    if spad.doc.isEncrypted:
        return True
    spad.seiten = spad.doc.pageCount

    # Always provide these three keys, even if the PDF does not carry them.
    spad.meta = {"author": "", "title": "", "subject": ""}
    for key, wert in spad.doc.metadata.items():
        if wert:
            if pyversion < 3:
                spad.meta[key] = wert.decode("utf-8", "ignore")
            else:
                spad.meta[key] = wert
        else:
            spad.meta[key] = ""

    spad.fromjson = False
    spad.inhalt = spad.doc.getToC(simple=False)
    tocfile = spad.file + ".json"
    if os.path.exists(tocfile):
        d = wx.MessageDialog(
            None,
            "Saved data exist for this PDF - Use them instead?",
            "Input available from previous edit session",
            wx.YES_NO | wx.ICON_QUESTION,
        )
        rc = d.ShowModal()
        d.Destroy()
        d = None
        if rc == wx.ID_YES:
            try:
                # `with` closes the file even when json.load() fails.
                with open(tocfile) as f_toc:
                    saved = json.load(f_toc)
                # Read every required key before touching `spad`, so an
                # incomplete file cannot leave half-applied saved data.
                toc = saved["toc"]
                author = saved["author"]
                title = saved["title"]
                subject = saved["subject"]
                keywords = saved["keywords"]
            except Exception:
                d = wx.MessageDialog(None, "Ignoring saved data",
                                     "Invalid input from previous session")
                d.ShowModal()
                d.Destroy()
                d = None
            else:
                spad.fromjson = True
                spad.inhalt = toc
                spad.meta["author"] = author
                spad.meta["title"] = title
                spad.meta["subject"] = subject
                spad.meta["keywords"] = keywords
        else:
            os.remove(tocfile)
    return False
def procesarPDF(nombreArchivoEntrada, listaMaterias, fdSalida):
    """Extract course-offer rows from a PDF and append them to `fdSalida`.

    Each page is dumped to a temporary XML file with MuPDF and fed to a SAX
    parser whose content handler is ``OfertasGeneral(listaMaterias)``. The
    rows the handler produces are then turned into comma-separated fields.

    Args:
        nombreArchivoEntrada: path of the input PDF.
        listaMaterias: passed unchanged to ``OfertasGeneral``.
        fdSalida: list-like object; one list of CSV fields is appended per row.
    """
    doc = fitz.open(nombreArchivoEntrada)

    # Create an XML reader with namespace processing switched off.
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)

    # Override the default ContentHandler.
    Handler = OfertasGeneral(listaMaterias)
    parser.setContentHandler(Handler)

    for num in range(0, doc.pageCount):
        # Process the PDF with MuPDF: the page text is extracted as XML.
        page = doc.loadPage(num)

        # Write the temporary file. The handle must be closed (flushed)
        # BEFORE the parser reads the file, otherwise buffered data may be
        # missing from what the parser sees.
        try:
            with open('textPDFXML1.xml', 'w') as f:
                f.write(page.getText(output = "xml"))
        except OSError as ose:
            print("Error de E/S: ", ose)
        else:
            try:
                # Process the XML file.
                parser.parse("textPDFXML1.xml")
            finally:
                remove('textPDFXML1.xml')

    doc.close()

    # Build one string per row and emit it as a list of CSV fields.
    # NOTE(review): the row loop is assumed to run once after all pages were
    # parsed, and only for non-empty rows - confirm against the original layout.
    for fil in Handler.subdividirFilas():
        if not fil:
            continue

        if len(fil) > 1:
            if isinstance(fil[1], tuple):
                # No second text column: schedules start at index 1.
                acum = fil[0] + ',A'
                horariosOrdenados = sorted(fil[1:], key=ordenarDias)
            else:
                acum = ",".join(fil[:2])
                horariosOrdenados = sorted(fil[2:], key=ordenarDias)

            if horariosOrdenados:
                acum += componerHorarioCSV(horariosOrdenados) + ','
            else:
                acum += ',,,,,,Y'
        else:
            # Exactly one element.
            acum = fil[0] + ',A,,,,,,Y'

        fdSalida.append(acum.split(','))
def getPDFinfo():
    """Open ``PDFcfg.file`` and load ToC, page count and metadata into ``PDFcfg``.

    Returns:
        True if the document is still encrypted after the decryption attempt
        (nothing else is loaded in that case), otherwise False.
    """
    PDFcfg.doc = fitz.open(PDFcfg.file)
    if PDFcfg.doc.needsPass:
        decrypt_doc()
    if PDFcfg.doc.isEncrypted:
        return True
    PDFcfg.inhalt = PDFcfg.doc.getToC()
    PDFcfg.seiten = PDFcfg.doc.pageCount

    # Always provide these three keys, even if the PDF does not carry them.
    PDFmeta = {"author":"", "title":"", "subject":""}
    for key, wert in PDFcfg.doc.metadata.items():
        if wert:
            # Only byte strings need decoding. Text strings (Python 3 `str`)
            # have no .decode() and are taken as they are.
            if isinstance(wert, bytes):
                PDFmeta[key] = wert.decode("utf-8", "ignore")
            else:
                PDFmeta[key] = wert
        else:
            PDFmeta[key] = ""
    PDFcfg.meta = PDFmeta
    return False
def __init__(self, url, background_color):
    """Open the document at `url` and initialise all viewer state.

    Page width and height are taken from the pixmap of page 0. Everything
    else (scale, scrolling, padding, annotation font, page cache) is set to
    fixed starting values.
    """
    super(PdfViewerWidget, self).__init__()

    self.url, self.background_color = url, background_color

    # The document must be loaded before any page information is available.
    self.document = fitz.open(url)

    # Page information comes from the first page's pixmap.
    first_pixmap = self.document.getPagePixmap(0)
    self.first_pixmap = first_pixmap
    self.page_width, self.page_height = first_pixmap.width, first_pixmap.height
    self.page_total_number = self.document.pageCount

    # Scale and scale mode.
    self.scale = 1.0
    self.read_mode = "fit_to_width"

    # Scrolling.
    self.scroll_step = 20
    self.scroll_offset = 0
    self.mouse_scroll_offset = 20

    # Gap between consecutive pages.
    self.page_padding = 10

    # Page annotation geometry, colour and font.
    self.page_annotate_height = 22
    self.page_annotate_padding_right = 10
    self.page_annotate_padding_bottom = 10
    self.page_annotate_color = QColor("#333333")
    annotate_font = QFont()
    annotate_font.setPointSize(12)
    self.font = annotate_font

    # Page cache bookkeeping.
    self.page_cache_pixmap_dict = {}
    self.page_cache_scale = self.scale
    self.page_cache_trans = None
    self.page_cache_context_delay = 1000
    self.last_action_time = 0
    self.is_page_just_changed = False
    self.remember_offset = None
def return_image_obj(fs_path):
    """Open an image, or the first page of a PDF, as an RGB Pillow image.

    Args:
        fs_path: file system path of the image or PDF. The extension
            (case-insensitive) decides whether the file is treated as a PDF.

    Returns:
        A Pillow image in mode "RGB".
    """
    # Was `os.path.splitxt`, which does not exist and raised AttributeError.
    fext = os.path.splitext(fs_path)[1][1:].upper()
    if fext == "PDF":
        pdf_file = fitz.open(fs_path)
        try:
            pdf_page = pdf_file.loadPage(0)
            # Was `fitz.Identify`; the identity matrix is `fitz.Identity`.
            pix = pdf_page.getPixmap(matrix=fitz.Identity,
                                     colorspace="rgb",
                                     alpha=True)
            png_data = pix.getPNGData()
        finally:
            # The PNG bytes are self-contained; release the document.
            pdf_file.close()
        source_image = Image.open(BytesIO(png_data))
    else:
        source_image = Image.open(fs_path)

    # Normalise every result (e.g. RGBA from the alpha render) to plain RGB.
    if source_image.mode != "RGB":
        source_image = source_image.convert('RGB')
    return source_image
def check_pdf(filename):
    """
    Use the PyMuPDF library to verify the structure of a PDF file.

    :param filename: The FQPN filename of the file in question to check
    :type filename: String

    :return: A tuple that contains

        * Boolean - Is Clean (True if no issue, False if issue)
        * Generic error message, eg. expected generation number
        * Raw error message, eg. expected generation number (25366 ? obj)

    :rtype: Tuple

    The generic error message is the raw message cut off at the first "(",
    to remove changing data, so that it can be used in the filtered excel
    report.

    .. code-block:: python

        >>> check_pdf(r"test_samples\\badpdf\\Administrative - 30 - Consent to Treat 02-16-07 - 7712.pdf")
        (False, 'expected generation number', 'expected generation number (25366 ? obj)')
        >>> check_pdf(r"test_samples\\badpdf\\Administrative - 30 - PayPol 05-27-08 - 7713.pdf")
        (False, 'expected generation number', 'expected generation number (17469 ? obj)')
        >>> check_pdf(r"test_samples\\goodpdf\\CCD_extract_101001-00.html.pdf")
        (True, '', '')
        >>> check_pdf(r"test_samples\\goodpdf\\CCD_extract_101002-00.html.pdf")
        (True, '', '')
    """
    errmsg = ""
    pdffile = None
    try:
        pdffile = fitz.open(filename)
        raw_errmsg = pdffile.openErrMsg
        errorcode = pdffile.openErrCode
    except RuntimeError:
        #
        # A truly fatal error occurred, trap, assuming it's file not found.
        # (Need to verify FNF is the only condition this applies to.)
        raw_errmsg = "File Not Found"
        errorcode = -1
    finally:
        # Only the two status values are needed; do not keep the file open.
        if pdffile is not None:
            pdffile.close()

    if raw_errmsg != "":            # There is an error
        if "(" in raw_errmsg:       # Drop the variable "(...)" tail
            errmsg = raw_errmsg[0:raw_errmsg.find("(")].strip()
        else:
            errmsg = raw_errmsg

    return (errorcode == 0, errmsg, raw_errmsg)
def scrape_urls(self):
    r"""Collect the URLs found in the text of ``self.local_file``.

    For every page: read the text, remove newlines, replace 'http' with
    '\nhttp', then split by newline. Each resulting line that starts with
    'http' is passed to ``self.extract_url_from_line`` and the result is
    appended to ``self.urls`` unless it is already there.

    Does nothing when ``self.local_file`` is empty / not set.
    """
    if not self.local_file:
        return

    pdf = fitz.open(self.local_file)
    try:
        for page in pdf:
            text_raw = page.getText()
            text_no_newlines = text_raw.replace('\n', '')
            # Force every URL to begin a line of its own.
            text_with_newlines = text_no_newlines.replace('http', '\nhttp')
            for line in text_with_newlines.split('\n'):
                if line.startswith('http'):
                    url = self.extract_url_from_line(line)
                    if url not in self.urls:
                        self.urls.append(url)
    finally:
        # Release the document even if extraction fails on some page.
        pdf.close()
def __init__(self, filepath, main_window, parent=None):
    """Create the graphics view and open the book at `filepath`.

    The lower-level reader depends on the file extension: ``cbz`` uses
    zipfile, ``cbr`` uses rarfile and ``pdf`` uses fitz. For any other
    extension ``self.book`` is not set here.
    """
    super(PliantQGraphicsView, self).__init__(parent)
    self._translate = QtCore.QCoreApplication.translate
    self.parent = parent
    self.main_window = main_window

    # Image state.
    self.image_pixmap = None
    self.image_cache = [None] * 4
    self.thread = None

    self.annotation_dict = self.parent.metadata['annotations']

    self.filepath = filepath
    self.filetype = os.path.splitext(self.filepath)[1][1:]

    # One opener per supported extension; wrapped in lambdas so that only
    # the matching backend is touched.
    openers = {
        'cbz': lambda: zipfile.ZipFile(self.filepath),
        'cbr': lambda: rarfile.RarFile(self.filepath),
        'pdf': lambda: fitz.open(self.filepath),
    }
    opener = openers.get(self.filetype)
    if opener is not None:
        self.book = opener()

    self.common_functions = PliantWidgetsCommonFunctions(
        self, self.main_window)

    self.ignore_wheel_event = False
    self.ignore_wheel_event_number = 0

    # View behaviour and the custom context menu.
    self.setMouseTracking(True)
    self.setDragMode(QtWidgets.QGraphicsView.ScrollHandDrag)
    self.setContextMenuPolicy(QtCore.Qt.CustomContextMenu)
    self.customContextMenuRequested.connect(
        self.generate_graphicsview_context_menu)
def read_book(self):
    """Open ``self.filename`` with fitz and keep the document in ``self.book``."""
    opened_document = fitz.open(self.filename)
    self.book = opened_document
def OneJpg2OnePdf(jpg_path, pdf_path, pdf_name):
    """Convert a single image file into a one-page PDF.

    Args:
        jpg_path: path of the input image.
        pdf_path: directory the PDF is written to.
        pdf_name: file name of the PDF without the ``.pdf`` extension.

    The output path is ``pdf_path + '\\' + pdf_name + '.pdf'``.
    """
    imgdoc = fitz.open(jpg_path)
    try:
        # Was `convetToPdf`, which does not exist and raised AttributeError.
        img_byte = imgdoc.convertToPDF()
    finally:
        imgdoc.close()

    img_pdf = fitz.open("pdf", img_byte)
    try:
        img_pdf.save(pdf_path + '\\{}.pdf'.format(pdf_name))
    finally:
        # Close the document even when saving fails.
        img_pdf.close()
updated_end_index = end_index + pre_index mod_para_content = para_content[updated_start_index:updated_end_index] super_string = mod_para_content page = fitz_doc_obj[page_number - 1] super_area = page.searchFor(super_string) text_instances = page.searchFor(elem) for inst in text_instances: if bb_intersection_over_union(inst, super_area): specific_word_coordinates_list = [ inst[0], inst[1], inst[2], inst[3] ] break return specific_word_coordinates_list if __name__ == '__main__': pdf_path = "/Users/nr012/Downloads/2015-3.pdf" fitz_doc_obj = fitz.open(pdf_path) page_no = 1 start_index, end_index = 51, 60 para_content = "The following important factors, and other factors described elsewhere in this Report or contained in our other filings with the U.S. Securities and Exchange Commission (SEC), among others, could cause our results to differ materially from any results described in any forward-looking statements:." # start_index, end_index = 16, 28 #successfully in 1Page.pdf # para_content = "We believe that successfully meeting these objectives will generate financial performance exceeding that of our peers and result in full and fair valuation of our common shares." elem = para_content[start_index:end_index] specific_word_coordinates_list = [] if end_index == 0: specific_word_coordinates_list = [] else: specific_word_coordinates_list = get_specific_word_coords( para_content, page_no, elem, fitz_doc_obj, start_index, end_index) print(specific_word_coordinates_list)
import fitz

# Write every image referenced by the pages of "calvin2.pdf" to a PNG file
# named "p<page index>-<xref>.png" in the current directory.
doc = fitz.open("calvin2.pdf")
for page_index in range(len(doc)):
    for image_entry in doc.getPageImageList(page_index):
        xref = image_entry[0]           # first item is the image's xref
        target = "p%s-%s.png" % (page_index, xref)
        pix = fitz.Pixmap(doc, xref)
        if pix.n >= 5:
            # CMYK: convert to RGB first
            rgb_pix = fitz.Pixmap(fitz.csRGB, pix)
            rgb_pix.writePNG(target)
            rgb_pix = None
        else:
            # this is GRAY or RGB
            pix.writePNG(target)
        pix = None                      # drop the pixmap before the next one
F1 = ' , '.join(map(str, F2)) return A1, P1, R1, F1 ########################################### EXTRACCION PDF #################################3 carp = "public" pdf = sys.argv[1] pdf = carp + pdf df = read_pdf(pdf, pages="2") tabula.convert_into(pdf, ("frmXYelim-291272" + '.csv'), output_format="csv", pages="2") pdf_documento = pdf documento = fitz.open(pdf_documento) pagina = documento.loadPage(1) text = pagina.getText("text") doc = (re.sub('[!?@#$()-.,;:*/0-9%"]+', ' ', text.lower())).split() ################################### WHILE PARA OBTNER EL TEXTO DESEADO ############ c = 0 m = 0 hecho = [] demanda = "" while (c < len(doc)): if (doc[c] == "hecho"): c = c + 1 while (c < len(doc)): if (doc[c] == "sumario"):
def make_pdf(dlg):
    """Join page ranges of several input PDFs into one output PDF.

    Input comes from the dialog `dlg`: one table row per input file in
    ``dlg.szr02.Table.data`` (row[0] file name, row[2]/row[3] 1-based first
    and last page, row[4] rotation), the output path from ``dlg.btn_aus``,
    and metadata from the dialog's text fields. Unless ``dlg.noToC`` is set,
    a combined table of contents is built from the input files' ToCs.

    Returns:
        The output path, or None when the table contains no files.
    """
    # no file selected: treat like "QUIT"
    if not len(dlg.szr02.Table.data):       # no files there - quit
        return None
    # create time zone value in PDF format
    cdate = fitz.getPDFnow()
    ausgabe = dlg.btn_aus.GetPath()
    pdf_out = fitz.open()                   # empty new PDF document
    aus_nr = 0                              # current page number in output
    pdf_dict = {"creator": "PDF Joiner",
                "producer": "PyMuPDF",
                "creationDate": cdate,
                "modDate": cdate,
                "title": dlg.austit.Value,
                "author": dlg.ausaut.Value,
                "subject": dlg.aussub.Value,
                "keywords": dlg.keywords.Value}
    pdf_out.setMetadata(pdf_dict)           # put in meta data
    total_toc = []                          # initialize TOC
#==============================================================================
# process one input file
#==============================================================================
    for zeile in dlg.szr02.Table.data:
        dateiname = zeile[0]
        doc = fitz.open(dateiname)
        max_seiten = len(doc)
#==============================================================================
# user input minus 1, PDF pages count from zero
# also correct any inconsistent input
#==============================================================================
        von = int(zeile[2]) - 1             # first PDF page number
        bis = int(zeile[3]) - 1             # last PDF page number

        von = min(max(0, von), max_seiten - 1)      # "from" must be in range
        bis = min(max(0, bis), max_seiten - 1)      # "to" must be in range
        rot = int(zeile[4])                 # get rotation angle
        # now copy the page range
        pdf_out.insertPDF(doc, from_page = von, to_page = bis,
                          rotate = rot)
        # NOTE(review): this early `continue` skips the doc.close() at the
        # end of the loop body, so the input stays open - confirm intended.
        if dlg.noToC.Value:                 # no ToC wanted - get next file
            continue

        incr = 1                            # standard increment for page range
        if bis < von:
            incr = -1                       # increment for reversed sequence
        # list of page numbers in range
        pno_range = list(range(von, bis + incr, incr))
        # standard bookmark title = "infile [pp from-to of max.pages]"
        bm_main_title = "%s [pp. %s-%s of %s]" % \
              (os.path.basename(dateiname[:-4]), von + 1,
               bis + 1, max_seiten)
        # insert standard bookmark ahead of any page range
        total_toc.append([1, bm_main_title, aus_nr + 1])
        toc = doc.getToC(simple = False)    # get file's TOC
        last_lvl = 1                        # immunize against hierarchy gaps
        for t in toc:
            lnk_type = t[3]["kind"]         # if "goto", page must be in range
            if (t[2] - 1) not in pno_range and lnk_type == fitz.LINK_GOTO:
                continue
            if lnk_type == fitz.LINK_GOTO:
                # position inside the copied range, shifted by the pages
                # already present in the output
                pno = pno_range.index(t[2] - 1) + aus_nr + 1
            # NOTE(review): for entries that are not LINK_GOTO, `pno` keeps
            # the value of the previous entry (or is unbound if there was
            # none) - confirm this is the intended behaviour.
            # repair hierarchy gaps by filler bookmarks
            while (t[0] > last_lvl + 1):
                total_toc.append([last_lvl + 1, "<>", pno, t[3]])
                last_lvl += 1
            last_lvl = t[0]
            t[2] = pno
            total_toc.append(t)

        aus_nr += len(pno_range)            # increase output counter
        doc.close()
        doc = None

#==============================================================================
# all input files processed
#==============================================================================
    if total_toc:
        pdf_out.setToC(total_toc)
    pdf_out.save(ausgabe)
    pdf_out.close()
    return ausgabe
Main purpose of this function is to demonstrate that working with PyMuPDF is easy and straightforward ... What does introduce some complexity is the ability to scale, and to left-right flip the image while maintaining the text legible. ------------------------------------------------------------------------------- New (2017-09-21): ----------------- Scaling and other morphing effects can now also be achieved with a morphing matrix. This is possible after page method "insertTextbox" also supports this. ------------------------------------------------------------------------------- """ #============================================================================== # invoke the pencil function #============================================================================== if __name__ == "__main__": doc=fitz.open() # empty new PDF page = doc.newPage() # create page (A4) img = page.newShape() # create shape # ============================================================================= # pencil 1 # ============================================================================= penheight = 100 # thickness of pencil pentip = fitz.Point(100, 150) # first pencil tip here pencil(img, pentip, penheight, True) # pencil points left # ============================================================================= # pencil 2 # ============================================================================= penheight = 20 # now a smaller one pentip = fitz.Point(100, 250) # new pencil tip pencil(img, pentip, penheight, False) # this one points right
for root, dirs, files in os.walk(IN_DIR, topdown=False): for name in files: all_pdf_files.append(join(root, name)) with open("out.csv", "w") as file: file.write( f"\"Číslo pracovnej cesty\"; \"Meno a priezvisko\"; \"EVČ\"; \"Suma\"\n" ) for pdf_file in all_pdf_files: print(f"File: {pdf_file}") with fitz.open(pdf_file) as doc: text = "" for page in doc: text += page.getText() print(text) # ID x = re.findall("\nVyúčtovanie pracovnej cesty č. [0-9]+\n", text) print(x) cp_id = x[0].split(" ")[-1].strip() print(f"ID: {cp_id}") file.write(f"\"{cp_id}\";") # Meno x = re.findall("\nPriezvisko, meno, titul:\n.+\n", text)
"""
Demo / Experimental: Replace the fonts in a PDF.
"""
import fitz
import sys

fname = sys.argv[1]
doc = fitz.open(fname)  # input PDF
out = fitz.open()  # output PDF

# Read the font mapping. `with` closes the file as soon as it has been read.
with open("fonts.csv") as font_file:
    csv = font_file.read().splitlines()
all_fonts = []  # will contain: (old basefont name, Base14 name)
for f in csv:
    all_fonts.append(f.split(";"))


def pdf_color(srgb):
    """Create a PDF color triple from a given sRGB color integer.

    `srgb` is read as 0xRRGGBB. Each component is scaled to the range
    0..1, and the result is returned as ``(r, g, b)``.
    """
    # Integer (floor) division is required here. With true division the
    # lower bytes leak into the upper components as fractions, and the
    # values can exceed 1.0 (e.g. for 0xFFFFFF).
    b = (srgb % 256) / 255
    srgb //= 256
    g = (srgb % 256) / 255
    srgb //= 256
    r = srgb / 255
    return (r, g, b)


def get_font(fontname):
    """Lookup base fontname and return one of the "reserved" Base14 fontnames.
    """
def pdf2pic(path, pic_path):
    """Export the RGB and CMYK images found in the PDF at `path` as PNG files.

    Every xref whose object source contains both "/Type /XObject" and
    "/Subtype /Image" is loaded as a pixmap. Images with 3 colorspace
    components are written directly, images with 4 are converted to RGB
    first, and everything else is only reported. Output names come from
    ``new_img_path(path, pic_path, counter)``. Progress is printed.
    """
    t0 = time.perf_counter()
    xobject_pattern = re.compile(r'/Type(?= */XObject)')
    image_pattern = re.compile(r'/Subtype(?= */Image)')

    doc = fitz.open(path)
    imgcount = 0
    total_img_cnt = 0

    lenXREF = doc._getXrefLength()
    print(f'path:({path}) pages:({len(doc)}) object:({lenXREF-1})')

    for i in range(1, lenXREF):
        text = doc._getXrefString(i)
        # Only image XObjects are of interest.
        if not (xobject_pattern.search(text) and image_pattern.search(text)):
            continue

        print(f'[{i}]:--------------')
        print(f'[{i}]:text:({text})')
        total_img_cnt += 1

        pix = fitz.Pixmap(doc, i)
        print(f'[{i}]:pix:({pix})')
        print(f'[{i}]:pix.colorspace:({pix.colorspace})')

        cs = pix.colorspace
        print(f'[{i}]:cs:name({cs.name})) value({cs.n}) pix.n({pix.n})')

        components = cs.n
        if components in (3, 4):
            imgcount += 1
            new_path = new_img_path(path, pic_path, imgcount)
            if components == 3:         # csRGB
                pix.writePNG(new_path)
            else:                       # csCMYK: convert to RGB first
                pix0 = fitz.Pixmap(fitz.csRGB, pix)
                pix0.writePNG(new_path)
                pix0 = None
        elif components == 1:           # csGRAY
            print(f'[{i}]:ignore gray image.')
        elif components == 2:           # unknown
            print(f'[{i}]:unknown colorspace.({cs})')
        else:
            print(f'[{i}]:error.unknown colorspace({cs})')
        pix = None

    t1 = time.perf_counter()
    print(f'found ({total_img_cnt}) images,({imgcount}) exported.')
    print(f'done.needs ({t1-t0}) secs')
def valid(upload_pics='.'):
    """Compare OCR text of a reference PDF with OCR text of uploaded pictures.

    Every page of the reference PDF is rendered and sent to the OCR client,
    as is every file in ``./pics``. For each PDF page the most similar
    uploaded text is selected. For every pair that is not a perfect match
    after ``compares`` normalisation, an HTML diff is appended to
    ``./diff.html``.

    Args:
        upload_pics: currently unused; the pictures are always read from
            ``./pics``.
    """
    def get_content_ocr(ocrObject):
        """Join the 'words' of an OCR result, one per line; '' if malformed."""
        try:
            return ''.join(item['words'] + '\n'
                           for item in ocrObject['words_result'])
        except (KeyError, TypeError):
            # Result without 'words_result' / 'words', or not a mapping at
            # all (e.g. an error response): treat as "no text".
            return ''

    pdf_path = '2020劳动合同范文.pdf.pdf'
    doc = fitz.open(pdf_path)
    valid_pdf_img = [item.getPixmap() for item in doc]
    doc.close()

    # Recognised text of each page of the original pdf.
    src_pdf_text = [
        get_content_ocr(client.basicAccurate(item.getImageData(output='png')))
        for item in valid_pdf_img
    ]

    upload_img = []
    for i in os.listdir('./pics'):
        with open('pics/' + i, 'rb') as f:
            upload_img.append(f.read())
    upload_text = [
        get_content_ocr(client.basicAccurate(item)) for item in upload_img
    ]

    # Sort: for every source page pick the first upload with the highest
    # similarity.
    upload_text_sort = []
    for src_text in src_pdf_text:
        similaritys = [get_equal_rate(src_text, usr_text)
                       for usr_text in upload_text]
        best = max(similaritys)
        index = similaritys.index(best)
        upload_text_sort.append(upload_text[index])

    compare_html = os.path.join('.', 'diff.html')
    compare_sum = 0     # running total of similarities; not returned here
    print(len(src_pdf_text))
    for src_txt, up_txt in zip(src_pdf_text, upload_text_sort):
        print('#' * 35)
        print('src_txt:', src_txt)
        src_remove_char, up_remove_char = compares(src_txt), compares(up_txt)
        similarity_remove = get_equal_rate(src_remove_char, up_remove_char)
        compare_sum += similarity_remove
        diff = difflib.HtmlDiff()
        result = diff.make_file(src_txt.split('\n'), up_txt.split('\n'))
        try:
            if similarity_remove < 1:
                # `with` closes the report even when the write fails.
                with open(compare_html, "a", encoding='utf-8') as fd_diff:
                    fd_diff.write(result)
        except Exception as e:
            # Best effort: a failed report write must not stop the comparison.
            import traceback
            traceback.print_exc()
rc = False if str is bytes: imgdir = sys.argv[1] # where my files are else: rc, imgdir = psg.GetPathBox("Make a PDF from Attached Files", "Enter file directory:") if not imgdir: raise SystemExit() t0 = mytime() # set start timer width, height = fitz.PaperSize("a6-l") # get paper format doc = fitz.open() # open empty PDF page = doc.newPage(width = width, # make new page height = height) # define sub rect to receive text and annotation symbols rect = fitz.Rect(0, 0, width, height) + (36, 36, -36, -36) imglist = os.listdir(imgdir) # directory listing imgcount = len(imglist) # number of files # calculate number of pages we will create per_page = ((width - 72) // 25) * ((height - 36 - 56) // 35) pages = int(round(imgcount / per_page + 0.5)) # header text text = "Contains the following %i files from '%s':\n\n" % (imgcount, imgdir)
highlight = "this text is highlighted" underline = "this text is underlined" strikeout = "this text is striked out" squiggled = "this text is zigzag-underlined" red = (1, 0, 0) blue = (0, 0, 1) gold = (1, 1, 0) green = (0, 1, 0) displ = fitz.Rect(0, 50, 0, 50) r = fitz.Rect(72, 72, 220, 100) t1 = u"têxt üsès Lätiñ charß,\nEUR: €, mu: µ, super scripts: ²³!" font = fitz.Font("helv") # used by the TextWriter class doc = fitz.open() page = doc.newPage() page.setRotation(0) # following makes sure that TextWriter references the **unrotated** page rect # as everything else does ... page_rect = page.rect * page.derotationMatrix def print_descr(annot): """Print a short description to the right of the annot rect.""" rect = annot.rect page = annot.parent writer = fitz.TextWriter(page_rect, color=red) writer.append(rect.br + (10, -5),
def __init__(self, parent, filename):
    """Create the PDF display dialog and show the first page of `filename`.

    Opens the document with fitz, asks for decryption if a password is
    needed, then builds the controls (forward / backward buttons, page
    number field, page count label, "show links" checkbox, paper format
    label, page image), lays them out and binds the event handlers.

    If the document is still encrypted after the decryption attempt, the
    dialog is destroyed and construction stops early.
    """
    defPos = wx.DefaultPosition
    defSiz = wx.DefaultSize
    zoom   = 1.2                        # zoom factor of display
    wx.Dialog.__init__ (self, parent, id = wx.ID_ANY,
        title = u"Display with PyMuPDF: ",
        pos = defPos, size = defSiz,
        style = wx.CAPTION|wx.CLOSE_BOX|
                wx.DEFAULT_DIALOG_STYLE)

    #======================================================================
    # display an icon top left of dialog, append filename to title
    #======================================================================
    if do_icon:
        self.SetIcon(ico_pdf.img.GetIcon())     # set a screen icon
    self.SetTitle(self.Title + filename)
    self.SetBackgroundColour(wx.Colour(240, 230, 140))

    #======================================================================
    # open the document with MuPDF when dialog gets created
    #======================================================================
    self.doc = fitz.open(filename)      # create Document object
    if self.doc.needsPass:              # check password protection
        self.decrypt_doc()
    if self.doc.isEncrypted:            # quit if we cannot decrypt
        # NOTE(review): none of the attributes below exist after this early
        # return - callers presumably must not use the dialog; confirm.
        self.Destroy()
        return
    self.dl_array = [0] * len(self.doc) # one slot per page
    self.last_page = -1                 # memorize last page displayed
    self.link_rects = []                # store link rectangles here
    self.link_texts = []                # store link texts here
    self.current_idx = -1               # store entry of found rectangle
    self.current_lnks = []              # presumably the links of the current page - TODO confirm

    #======================================================================
    # define zooming matrix for displaying PDF page images
    # we increase images by 20%, so take 1.2 as scale factors
    #======================================================================
    self.matrix = fitz.Matrix(zoom, zoom)   # will use a constant zoom

    '''
    =======================================================================
    Overall Dialog Structure:
    -------------------------
    szr10 (main sizer for the whole dialog - vertical orientation)
    +-> szr20 (sizer for buttons etc. - horizontal orientation)
      +-> button forward
      +-> button backward
      +-> field for page number to jump to
      +-> field displaying total pages
    +-> PDF image area
    =======================================================================
    '''

    # forward button
    self.ButtonNext = wx.Button(self, wx.ID_ANY, u"forw",
                       defPos, defSiz, wx.BU_EXACTFIT)
    # backward button
    self.ButtonPrevious = wx.Button(self, wx.ID_ANY, u"back",
                       defPos, defSiz, wx.BU_EXACTFIT)
    #======================================================================
    # text field for entering a target page. wx.TE_PROCESS_ENTER is
    # required to get data entry fired as events.
    #======================================================================
    self.TextToPage = wx.TextCtrl(self, wx.ID_ANY, u"1", defPos,
                         wx.Size(40, -1),
                         wx.TE_RIGHT|wx.TE_PROCESS_ENTER)
    # displays total pages and page paper format
    self.statPageMax = wx.StaticText(self, wx.ID_ANY,
                          "of " + str(len(self.doc)) + " pages.",
                          defPos, defSiz, 0)
    # checkbox "show links", initially checked
    self.links = wx.CheckBox( self, wx.ID_ANY, u"show links",
                       defPos, defSiz, wx.ALIGN_LEFT)
    self.links.Value = True
    self.paperform = wx.StaticText(self, wx.ID_ANY, "", defPos, defSiz, 0)
    # define the area for page images and load page 1 for primary display.
    # pdf_show() is called here, after the state and controls above exist.
    self.PDFimage = wx.StaticBitmap(self, wx.ID_ANY, self.pdf_show(1),
                       defPos, defSiz, style = 0)
    #======================================================================
    # the main sizer of the dialog
    #======================================================================
    self.szr10 = wx.BoxSizer(wx.VERTICAL)
    szr20 = wx.BoxSizer(wx.HORIZONTAL)
    szr20.Add(self.ButtonNext, 0, wx.ALL, 5)
    szr20.Add(self.ButtonPrevious, 0, wx.ALL, 5)
    szr20.Add(self.TextToPage, 0, wx.ALL, 5)
    szr20.Add(self.statPageMax, 0, wx.ALIGN_CENTER_VERTICAL|wx.ALL, 5)
    szr20.Add( self.links, 0, wx.ALIGN_CENTER_VERTICAL|wx.ALL, 5 )
    szr20.Add(self.paperform, 0, wx.ALIGN_CENTER_VERTICAL|wx.ALL, 5)
    # sizer ready, represents top dialog line
    self.szr10.Add(szr20, 0, wx.EXPAND, 5)
    self.szr10.Add(self.PDFimage, 0, wx.ALL, 5)
    # main sizer now ready - request final size & layout adjustments
    self.szr10.Fit(self)
    self.SetSizer(self.szr10)
    self.Layout()
    # center dialog on screen
    self.Centre(wx.BOTH)

    # Bind buttons and fields to event handlers
    self.ButtonNext.Bind(wx.EVT_BUTTON, self.NextPage)
    self.ButtonPrevious.Bind(wx.EVT_BUTTON, self.PreviousPage)
    self.TextToPage.Bind(wx.EVT_TEXT_ENTER, self.GotoPage)
    self.PDFimage.Bind(wx.EVT_MOUSEWHEEL, self.OnMouseWheel)
    self.PDFimage.Bind(wx.EVT_MOTION, self.move_mouse)
    self.PDFimage.Bind(wx.EVT_LEFT_DOWN, self.OnLeftDown)
def get_pdf_text(self, path):
    """Return the lower-cased text of all pages of the PDF at `path`.

    The text of every page is followed by a single space, so the result of
    a non-empty document ends with a space.
    """
    doc = fitz.open(path)
    try:
        # One join instead of repeated string concatenation.
        text = "".join(page.getText(flags=0) + " " for page in doc)
    finally:
        # Release the document even when text extraction fails.
        doc.close()
    return text.lower()
while (length < len(lst_text)): run = table.cell(2, 1).paragraphs[0].add_run('***') run.font.color.rgb = RGBColor(255, 0, 0) run = table.cell(2, 1).paragraphs[0].add_run(' ') length += 1 while (length < len(lst_ocr)): run = table.cell(2, 1).paragraphs[0].add_run(lst_ocr[length]) run.font.color.rgb = RGBColor(255, 0, 0) run = table.cell(2, 1).paragraphs[0].add_run(' ') length += 1 file = os.listdir('uploads') PDF = 'uploads/' + file[0] pdfDocument = fitz.open(PDF) document = Document() pageNum = pdfDocument.pageCount textPDF = '' textOCR = '' textDiff = '' documentPDF = "ready/textPDF.txt" documentOCR = "ready/textOCR.txt" documentRES = 'ready/RESULT.doc' # textPDF = textFromPDF(pdfDocument) doImage(PDF) for i in range(pageNum): file_png = "page" + str(i) + ".jpg" tmpTextOCR = text_EASYOCR(file_png) # tmpTextOCR = textFromPDF_OCR_1(file_png)
Export Script toc2csv.py ------------------------- import fitz import argparse #-------------------------------------------------------------------- # use argparse to handle invocation arguments #-------------------------------------------------------------------- parser = argparse.ArgumentParser(description="Enter CSV delimiter [;] and documment filename") parser.add_argument('-d', help='CSV delimiter [;]', default = ';') parser.add_argument('doc', help='document filename') args = parser.parse_args() delim = args.d # requested CSV delimiter character fname = args.doc # input document filename doc = fitz.open(fname) toc = doc.getToC(simple = False) ext = fname[-3:].lower() fname1 = fname[:-4] + "-toc.csv" outf = open(fname1, "w") for t in toc: t4 = t[3] if ext == "pdf": if t4["kind"] == 1: p4 = str(t4["to"].y) # add vertical destination if present else: p4 = "" else: p4 = "" rec = delim.join([str(t[0]), t[1].strip(), str(t[2]), p4]) outf.writelines([rec, "\n"]) outf.close()
msg = ["%i glyphs" % font.glyph_count, "size %i" % len(font.buffer)] if flags["mono"] == 1: msg.append("mono") if flags["serif"]: msg.append("serifed") if flags["italic"]: msg.append("italic") if flags["bold"]: msg.append("bold") msg = ", ".join(msg) return msg infilename = sys.argv[1] font_list = set() doc = fitz.open(infilename) for i in range(len(doc)): for f in doc.getPageFontList(i, full=True): msg = "" subset, fontname = get_fontnames(doc, f) if f[1] == "n/a": msg = "Not embedded!" else: extr = doc.extractFont(f[0]) font = fitz.Font(fontbuffer=extr[-1]) msg = make_msg(font) if subset: msg += ", subset font" font_list.add((fontname, msg))
def fill_in_fake_data_on_exams(paper_dir_path, classlist, outfile, which=None):
    """Fill-in exams with fake data for demo or testing.

    Every selected paper is opened, scribbled on, and appended to one
    combined PDF which is finally saved to `outfile`.  Papers whose file
    name already carries a student number (``exam_XXXX_YYYYYYYY.pdf``) keep
    their front page untouched; all other papers get a randomly chosen
    student number (as digit images) and name written on the front page.

    Arguments:
        paper_dir_path {Str or convertable to pathlib obj} -- Directory
            containing the blank exams.
        classlist (list): ordered list of (sid, sname) pairs.
        outfile {Str} -- Path to write results into this concatenated PDF file.

    Keyword Arguments:
        which {iterable of int} -- by default, scribble on all exams, or
            specify something like `which=range(10, 16)` here to scribble on
            a subset.  With `which`, only ``exam_XXXX.pdf`` files are
            considered and no paper is treated as prenamed.
            (default: {None})

    Raises:
        AssertionError: if PyMuPDF reports that a text box did not fit.
        ValueError: (from `random.sample`) if `classlist` has fewer unused
            students than there are papers needing a name.
    """
    # Customizable data
    blue = [0, 0, 0.75]
    student_number_length = 8
    extra_page_probability = 0.2
    digit_font_size = 24
    answer_font_size = 13
    extra_page_font_size = 18

    # We create the path objects
    paper_dir_path = Path(paper_dir_path)
    out_file_path = Path(outfile)

    print("Annotating papers with fake student data and scribbling on pages...")
    if not which:
        # those with an ID number
        named_papers_paths = glob(str(paper_dir_path / "exam_*_*.pdf"))
        # everything
        papers_paths = sorted(glob(str(paper_dir_path / "exam_*.pdf")))
    else:
        # Previously this branch left `named_papers_paths` unbound, which
        # crashed a few lines below.  Explicitly selected papers are
        # addressed as exam_XXXX.pdf, so none of them is prenamed.
        named_papers_paths = []
        # Use plain strings (like glob does) so membership tests and
        # fitz.open() see the same type in both branches.
        papers_paths = sorted(
            str(paper_dir_path / "exam_{}.pdf".format(str(index).zfill(4)))
            for index in which
        )

    # need to avoid any student numbers already used to name papers -
    # look at file names (exam_XXXX_<sid>.pdf -> <sid>)
    used_id_list = [
        os.path.split(file_name)[1].split(".")[0].split("_")[-1]
        for file_name in named_papers_paths
    ]
    # now load in the student names and numbers - only those not already
    # used to prename a paper
    clean_id_dict = {
        sid: sname for sid, sname in classlist if sid not in used_id_list
    }

    # now grab a random selection of IDs from the dict.
    # we need len(papers_paths) - len(named_papers_paths) of them
    id_sample = random.sample(
        list(clean_id_dict.keys()), len(papers_paths) - len(named_papers_paths)
    )

    # A complete collection of the pdfs created
    all_pdf_documents = fitz.open()

    clean_count = 0
    for index, file_name in enumerate(papers_paths):
        is_prenamed = file_name in named_papers_paths
        if is_prenamed:
            print("{} - prenamed paper - scribbled".format(os.path.basename(file_name)))
        else:
            student_number = id_sample[clean_count]
            student_name = clean_id_dict[student_number]
            clean_count += 1
            print(
                "{} - scribbled using {} {}".format(
                    os.path.basename(file_name), student_number, student_name
                )
            )

        # TODO: bump pymupdf minimum version to 1.17.2 and do:
        # with fitz.open(file_name) as pdf_document:
        pdf_document = fitz.open(file_name)
        try:
            front_page = pdf_document[0]

            # First we input the student names
            if not is_prenamed:  # can draw on front page
                # insert digit images into rectangles - some hackery
                # required to get correct positions.
                width = 28
                border = 8
                for digit_index in range(student_number_length):
                    rect1 = fitz.Rect(
                        220 + border * digit_index + width * digit_index,
                        265,
                        220 + border * digit_index + width * (digit_index + 1),
                        265 + width,
                    )
                    # pick one of the sample images for this digit
                    uuImg = digit_array[
                        int(student_number[digit_index]) * number_of_digits
                        + random.randrange(number_of_digits)
                    ]  # uu-encoded png
                    img_BString = base64.b64decode(uuImg)
                    front_page.insertImage(rect1, stream=img_BString, keep_proportion=True)
                    # TODO - there should be an assert or something here?

                digit_rectangle = fitz.Rect(228, 335, 550, 450)
                insertion_confirmed = front_page.insertTextbox(
                    digit_rectangle,
                    student_name,
                    fontsize=digit_font_size,
                    color=blue,
                    fontname="Helvetica",
                    fontfile=None,
                    align=0,
                )
                assert insertion_confirmed > 0

            # Write some random answers on the pages (not on the front page)
            for page_index, pdf_page in enumerate(pdf_document):
                random_answer_rect = fitz.Rect(
                    100 + 30 * random.random(), 150 + 20 * random.random(), 500, 500
                )
                random_answer_text = random.choice(possible_answers)

                # TODO: "helv" vs "Helvetica"
                if page_index >= 1:
                    insertion_confirmed = pdf_page.insertTextbox(
                        random_answer_rect,
                        random_answer_text,
                        fontsize=answer_font_size,
                        color=blue,
                        fontname="helv",
                        fontfile=None,
                        align=0,
                    )
                    assert insertion_confirmed > 0

            # delete last page from the zeroth test.
            if index == 0:
                pdf_document.deletePage(-1)
                print("Deleting last page of test {}".format(file_name))

            # We then add the pdfs into the document collection
            all_pdf_documents.insertPDF(pdf_document)
        finally:
            # the pages have been copied; release the per-paper document
            pdf_document.close()

        # For a comprehensive test, we add an extra page with a
        # probability of `extra_page_probability`
        if random.random() < extra_page_probability:
            # folder_name/exam_XXXX.pdf or folder_name/exam_XXXX_YYYYYYY.pdf,
            # file_pdf_name drops the folder name and the .pdf parts
            file_pdf_name = os.path.splitext(os.path.basename(file_name))[0]

            # Then we get the test number and student_number from file_pdf_name
            test_number = file_pdf_name.split("_")[1]
            if is_prenamed:  # file_pdf_name is exam_XXXX_YYYYYYY.pdf
                student_number = file_pdf_name.split("_")[2]
            # otherwise student_number is the one sampled above

            print(
                " making an extra page for test {} and sid {}".format(
                    test_number, student_number
                )
            )
            all_pdf_documents.insertPage(
                -1,
                text="EXTRA PAGE - t{} Q1 - {}".format(test_number, student_number),
                fontsize=extra_page_font_size,
                color=blue,
            )

    # need to use `str(out_file_path)` for pumypdf < 1.16.14
    # https://github.com/pymupdf/PyMuPDF/issues/466
    # Here we only need to save the generated pdf files with random test answers
    all_pdf_documents.save(out_file_path)
    all_pdf_documents.close()
    print('Assembled in "{}"'.format(out_file_path))
It scans through all objects and selects /Type/XObject with /Subtype/Image. So runtime is determined by number of objects and image volume. Usage: extract_img2.py input.pdf ''' from __future__ import print_function import fitz import sys, time, re checkXO = r"/Type(?= */XObject)" # finds "/Type/XObject" checkIM = r"/Subtype(?= */Image)" # finds "/Subtype/Image" assert len(sys.argv) == 2, 'Usage: %s <input file>' % sys.argv[0] t0 = time.clock() doc = fitz.open(sys.argv[1]) imgcount = 0 lenXREF = doc._getXrefLength() # number of objects - do not use entry 0! # display some file info print("file: %s, pages: %s, objects: %s" % (sys.argv[1], len(doc), lenXREF - 1)) for i in range(1, lenXREF): # scan through all objects text = doc._getObjectString(i) # string defining the object isXObject = re.search(checkXO, text) # tests for XObject isImage = re.search(checkIM, text) # tests for Image if not isXObject or not isImage: # not an image object if not both True continue pix = fitz.Pixmap(doc, i) # make pixmap from image if pix.colorspace is None: # this is just a mask!
if flags & 2**1: l.append("italic") if flags & 2**2: l.append("serifed") else: l.append("sans") if flags & 2**3: l.append("monospaced") else: l.append("proportional") if flags & 2**4: l.append("bold") return ", ".join(l) doc = fitz.open("text-tester.pdf") page = doc[0] # read page text as a dictionary, suppressing extra spaces in CJK fonts blocks = page.get_text("dict", flags=11)["blocks"] for b in blocks: # iterate through the text blocks for l in b["lines"]: # iterate through the text lines for s in l["spans"]: # iterate through the text spans print("") font_properties = "Font: '%s' (%s), size %g, color #%06x" % ( s["font"], # font name flags_decomposer(s["flags"]), # readable font flags s["size"], # font size s["color"], # font color ) print("Text: '%s'" % s["text"]) # simple print of text
def generateDoc(title, data, dataStart, dataEnd, bio1, bio2, bio3, bio4, vid_filename, textfile, filename, pageNum, path):
    """Build a one-page PDF report and return the open document.

    The page contains, top to bottom: a horizontally centred title, three
    EEG heatmaps (alpha, beta, theta) each with a centred caption, four
    biometric plots, a text box and an image taken from `vid_filename`.

    :param title: text drawn as the page title.
    :param data: data set handed unchanged to ``bioPlotter.plotBiometric``
        and ``eeg.eeg_viz``.
    :param dataStart: start of the data range, passed to the plotters.
    :param dataEnd: end of the data range, passed to the plotters.
    :param bio1, bio2, bio3, bio4: biometric names; each is passed to the
        plotter and used as part of the generated PNG file name.
    :param vid_filename: path of an image file inserted at the bottom left.
    :param textfile: passed to ``textHandler.createTextbox``.
    :param filename: unused; saving is left to the caller.
    :param pageNum: page number, used in the names of intermediate images.
    :param path: prefix (directory) for all intermediate image files.
    :return: the in-memory ``fitz`` document (not saved, not closed).
    """
    # Creates a new blank PDF
    doc = fitz.open()
    generatedPage = doc.newPage()

    font = "Times-Roman"
    fontSize = 24
    titleLength = fitz.getTextlength(title, font, fontSize)

    # Horizontal extent of the newly generated page
    pageRect = generatedPage.bound()
    page_x0 = pageRect.x0
    page_x1 = pageRect.x1

    # Ensures that the title will always be centered, despite text length
    pageMidpoint_X = (page_x1 - page_x0) / 2
    titleStartPoint_X = pageMidpoint_X - (titleLength / 2)
    titleStartPoint_Y = fontSize + 11
    titleStartPoint = fitz.Point(titleStartPoint_X, titleStartPoint_Y)
    generatedPage.insertText(titleStartPoint, title, fontname=font, fontsize=fontSize, rotate=0)

    # Autogenerates the biometric plots (bio1..bio4, in that order)
    bio_filenames = []
    for bio in (bio1, bio2, bio3, bio4):
        bio_filename = path + "page%i_" % pageNum + bio + ".png"
        bioPlotter.plotBiometric(data, dataStart, dataEnd, bio, bio_filename)
        bio_filenames.append(bio_filename)
    bio1_filename, bio2_filename, bio3_filename, bio4_filename = bio_filenames

    # Autogenerates the EEG heatmaps
    eeg.eeg_viz(data, dataStart, dataEnd, path + "page%i_eeg_" % pageNum)

    fontSize = 14

    # Inserts heatmap visualizations, each followed by its caption.
    # The caption is centred 94 units right of the heatmap's left edge,
    # using the width of *its own* text (previously the width of
    # "Alpha Band" was reused for all three captions).
    heatmaps = (
        (fitz.Rect(10, 50, 198, 238), "alpha", "Alpha Band"),
        (fitz.Rect(203, 50, 391, 238), "beta", "Beta Band"),
        (fitz.Rect(396, 50, 585, 238), "theta", "Theta Band"),
    )
    for heatmap_Location, band, caption in heatmaps:
        generatedPage.insertImage(heatmap_Location,
                                  filename=path + "page%i_eeg_%s.png" % (pageNum, band),
                                  keep_proportion=False)
        textLength = fitz.getTextlength(caption, font, fontSize)
        startPoint = fitz.Point((heatmap_Location.x0 + 94) - textLength / 2, 240)
        generatedPage.insertText(startPoint, caption, fontname=font, fontsize=fontSize, rotate=0)

    # Inserts biometric plots.  The lower row (bio3, bio4) is drawn first;
    # the upper row's rectangles overlap it by 20 units and are drawn on top.
    bio3_Location = fitz.Rect(10, 443, 300, 653)
    generatedPage.insertImage(bio3_Location, filename=bio3_filename, keep_proportion=False)

    bio4_Location = fitz.Rect(305, 443, 595, 653)
    generatedPage.insertImage(bio4_Location, filename=bio4_filename, keep_proportion=False)

    bio1_Location = fitz.Rect(10, 253, 300, 463)
    generatedPage.insertImage(bio1_Location, filename=bio1_filename, keep_proportion=False)

    bio2_Location = fitz.Rect(305, 253, 595, 463)
    generatedPage.insertImage(bio2_Location, filename=bio2_filename, keep_proportion=False)

    # Generates textbox
    textboxBack_Location = fitz.Rect(250, 650, 585, 815)
    textHandler.createTextbox(textfile, textboxBack_Location, generatedPage, path)

    # Inserts video frame (vid_filename is inserted directly as an image)
    vidFrame_Location = fitz.Rect(25, 675, 245, 799)
    generatedPage.insertImage(vidFrame_Location, filename=vid_filename, keep_proportion=False)

    # Saving the PDF is not needed here anymore; the caller gets the document
    return doc
# create color list sorted down by hue, value, saturation mylist = sorted(getColorInfoList(), reverse = True, key=lambda x: sortkey(x)) w = 800 # page width h = 600 # page height rw = 80 # width of color rect rh = 60 # height of color rect num_colors = len(mylist) # number of color triples black = getColor("black") # text color white = getColor("white") # text color fsize = 8 # fontsize lheight = fsize *1.2 # line height idx = 0 # index in color database doc = fitz.open() # empty PDF while idx < num_colors: doc.insertPage(-1, width = w, height = h) # new empty page page=doc[-1] # load it for i in range(10): # row index if idx >= num_colors: break for j in range(10): # column index rect = fitz.Rect(rw*j, rh*i, rw*j + rw, rh*i + rh) # color rect cname = mylist[idx][0].lower() # color name col = mylist[idx][1:] # color tuple -> to floats col = (col[0] / 255., col[1] / 255., col[2] / 255.) page.drawRect(rect, color = col, fill = col) # draw color rect pnt1 = rect.top_left + (0, rh*0.3) # pos of color name in white pnt2 = pnt1 + (0, lheight) # pos of color name in black page.insertText(pnt1, cname, fontsize = fsize, color = white)
def getText(file):
    """Return the plain text of the first page of the document `file`.

    :param file: path of the document, passed to ``fitz.open``.
    :return: the text of page 0 as returned by ``getText("text")``.
    """
    pdf = fitz.open(file)
    try:
        page = pdf.loadPage(0)
        return page.getText("text")
    finally:
        # the handle used to be leaked; release it even if loading fails
        pdf.close()
line = 0  # current worksheet row (rows are 1-based, incremented before use)


def get_info(tx, par):
    """Pick one piece of information out of a certificate's text lines.

    :param tx: iterable of text lines of the certificate page.
    :param par: ``'I'`` to get the certificate id/number (the text after
        the ``Certificate Id:`` / ``Certificate No:`` label), ``'C'`` to get
        the whole ``Course completed on ...`` line.
    :return: the matching string, or ``None`` when nothing matches or
        `par` is neither ``'I'`` nor ``'C'``.
    """
    if par == 'I':
        for ln in tx:
            if ln.startswith(('Certificate Id:', 'Certificate No:')):
                # skip the 15-character label plus the following separator
                return ln[16:]
    elif par == 'C':
        for ln in tx:
            if ln.startswith('Course completed on'):
                return ln
    return None


# One worksheet row per PDF certificate found in `pth`.
for certificate in os.listdir(path=pth):
    if certificate.endswith('.pdf'):
        line += 1
        pdfFileObj = fitz.open(os.path.join(pth, certificate))
        try:
            pageObj = pdfFileObj.loadPage(0)
            pageText = pageObj.getText()
        finally:
            # close every certificate once read instead of leaking it
            pdfFileObj.close()
        textlines = pageText.split('\n')
        worksheet["A" + str(line)] = textlines[2]  # third text line of the page
        worksheet["B" + str(line)] = get_info(textlines, 'C')
        worksheet["C" + str(line)] = get_info(textlines, 'I')
workbook.save("Certificates.xlsx")
# 4:Sig 2:NbDisk 2: NbCD 2:TotalDisk 2:TotalCD # 4:CDSize 4:Offset 2:ComLen offset = filedata.rfind("PK\5\6") + 20 # new comment length length = len(filedata) - offset - 2 with open(name, "wb") as f: f.write(filedata[:offset]) f.write(struct.pack("<H", length)) f.write(filedata[offset + 2:]) pdf, attach = sys.argv[1:3] doc = fitz.open(pdf) with open(attach, 'rb') as f: data = f.read() if ATTACHED: # add as attachment createAttachment(doc, attach, data) doc.saveIncr() else: # add as extra stream # appending one null byte to terminate the archive comment addStreamData(doc, data + "\0") # 255 = decompress all objects doc.save(doc.name, incremental=True, expand=255) doc.close()
driver.close() #%% EXTRACTING TEXT BLOCKS ''' This block extracts all paragraphs from EY VAT tax guides ''' files = os.listdir(main_dir + tax_guides) # EY worldwide tax guide directory personal_ind = 'worldwide-vat-gst-and-sales-tax-guide-' # VAT tax guide indicator text = [] filename_aux = [] for file in files: if file[0:len(personal_ind)] == personal_ind: # only corporate tax guides doc = fitz.open(file) num_pages = doc.pageCount for page in range(0, num_pages): text_ = doc.loadPage(page).getText("blocks") text_ = [x[4] for x in text_] # 4 = element with string in tuple text.extend(text_) for te in text_: filename_aux.append( file) # allows to identify tax guide and year #%% GETTING YEAR + COUNTRYNAME ''' In this block I will identify the country and the year from both the filename and the text within each file. '''
pix2.n == 1): print("unexpected /SMask situation: pix1", pix1, "pix2", pix2) return pix1 pix = fitz.Pixmap(pix1) # copy of pix1, alpha channel added pix.setAlpha(pix2.samples) # treat pix2.samples as alpha values pix1 = pix2 = None # free temp pixmaps return pix checkXO = r"/Type(?= */XObject)" # finds "/Type/XObject" checkIM = r"/Subtype(?= */Image)" # finds "/Subtype/Image" t0 = time.clock() fname = sys.argv[1] # file name fpref = sys.argv[2] # image file prefix doc = fitz.open(sys.argv[1]) imgcount = 0 lenXREF = doc._getXrefLength() # object count - do not use entry 0! # display some file info print("") print(__file__, "PDF: %s, pages: %s, objects: %s" % (sys.argv[1], len(doc), lenXREF-1)) smasks = [] # stores xrefs of /SMask objects #------------------------------------------------------------------------------ # loop through PDF images #------------------------------------------------------------------------------ for i in range(1, lenXREF): # scan through all objects try: text = doc._getObjectString(i) # PDF object definition string except:
from __future__ import print_function import fitz import sys #============================================================================== # Pie Chart program - semi circle version #============================================================================== from fitz.utils import getColor # for getting RGB colors by name doc = fitz.open() # new empty PDF doc.insertPage() # creates an ISO-A4 page page = doc[-1] # this is the page img = page.newShape() # title of the page title = "Sitzverteilung nach der Bundestagswahl 2013" # pie chart center and point of 1st data pie center = fitz.Point(200, 250) point = fitz.Point(100, 250) # will cycle through table data # this is the radius radius = abs(point - center) blue = getColor("blue") # we need some colors white = getColor("white") lineheight = 20 # legend line height ts_v = 150 # vertical start of legend block ts_h = center.x + radius + 50 # horizontal coord of legend block # these are the data to visualize: # number of seats of political parties in German parliament since 2013 table = ( # seats, party color & name (64, "violetred", "Die Linke"),
import fitz

# Example: reverse the order in which a page's annotations are placed.
# `n` (0-based page number) must be defined before running this snippet.
doc = fitz.open("some.pdf")
page = doc[n]

# First pass: walk the annotation chain and remember every rectangle.
rects = []
current = page.firstAnnot
while current:
    rects.append(current.rect)
    current = current.next

# Second pass: hand the rectangles out again, last one first.
rects.reverse()
current = page.firstAnnot
for new_rect in rects:
    current.setRect(new_rect)
    current = current.next

# save PDF with reversed annotations
doc.save("some-reversed.pdf")
# -*- coding: utf-8 -*-
"""
PyMuPDF Example Script:
------------------------
Split a given PDF into separate files of one page each.
For "input.pdf" the generated files are named "input-%i.pdf".

PyMuPDF license
"""
import os
import sys

import fitz

fn = sys.argv[1]
# strip the real extension instead of blindly chopping four characters
fn1 = os.path.splitext(fn)[0]
src = fitz.open(fn)
try:
    for i in range(len(src)):
        doc = fitz.open()  # new empty PDF for this single page
        try:
            doc.insertPDF(src, from_page=i, to_page=i)
            doc.save("%s-%i.pdf" % (fn1, i))
        finally:
            doc.close()
finally:
    src.close()
def pdf2pic(pdf_path, pic_path): """ # 从pdf中提取图片 :param pdf_path: pdf的路径 :param pic_path: 图片保存的路径 :return: """ # pic_path = r'C:\Users\big\Desktop\tt' t0 = time.perf_counter() # 生成图片初始时间 # 使用正则表达式来查找图片 checkXO = r"/Type(?= */XObject)" checkIM = r"/Subtype(?= */Image)" doc = fitz.open(pdf_path) # 打开pdf文件 img_count = 0 # 图片计数 len_XREF = doc._getXrefLength() # 获取对象数量长度 # 打印PDF的信息 print("文件名:{}, 页数: {}, 对象: {}".format(pdf_path, len(doc), len_XREF - 1)) c1, c2 = 170, 50 # 遍历每一个对象 # for i in range(1, 7780): # for i in range(len_XREF - 1, 0, -1): for i in range(1, len_XREF): text = doc._getXrefString(i) # 定义对象字符串 isXObject = re.search(checkXO, text) # 使用正则表达式查看是否是对象 isImage = re.search(checkIM, text) # 使用正则表达式查看是否是图片 if not isXObject or not isImage: # 如果不是对象也不是图片,则continue continue img_count += 1 # print(i, img_count) # continue pix = fitz.Pixmap(doc, i) # 根据索引生成图像 生成图像对象 # print(type(pix), pix.w, pix.pixel(1, 2)) # return for x in range(pix.w): for y in range(pix.h): if pix.pixel(x, y)[0] > c1: pix.setPixel(x, y, [254, 254, 254]) else: pix.setPixel(x, y, [c2, c2, c2]) # print(pix.pixel(1, 2)) # # 根据pdf的路径生成图片的名称 # # new_name = pdf_path.replace('\\', '_') + "_img{}.png".format(imgcount) # # new_name = new_name.replace(':', '') new_name = os.path.join(pic_path, f'{img_count}.png') # # print(new_name) # # new_name = "图片{}.png".format(imgcount) # 生成图片的名称 # if pix.n < 5: # 如果pix.n<5,可以直接存为PNG pix.writePNG(new_name) else: # 否则先转换CMYK pix0 = fitz.Pixmap(fitz.csRGB, pix) pix0.writePNG(new_name) pix0 = None pix = None # 释放资源 print("提取了第{}张图片".format(img_count)) # t1 = time.perf_counter() # 图片完成时间 # print("运行时间:{}s".format(t1 - t0)) # img = Image.open(os.path.join(pic_path, new_name)) # # print(type(img)) # img = img.convert('RGBA') # pixdata = img.load() # for y in range(img.size[1]): # for x in range(img.size[0]): # if pixdata[x, y][0] > c1 and pixdata[x, y][1] > c1 and pixdata[x, y][2] > c1: # pixdata[x, y] = (255, 255, 255, 255) # else: # pixdata[x, y] = (c2, c2, c2, 255) # # 
pixdata[x, y] = (pixdata[x, y][0] - c2, pixdata[x, y][1] - c2, pixdata[x, y][2] - c2, 255) # # img.show() doc.close() t1 = time.perf_counter() # 图片完成时间 print("总共提取了{}张图片".format(img_count)) print("运行时间:{}s".format(t1 - t0))
# Command line: # python embedded-import.py some.pdf embed.file #------------------------------------------------------------------------------ parser = argparse.ArgumentParser(description="Enter PDF, file to embed, and optional name, description and output pdf.") parser.add_argument('pdf', help='PDF filename') parser.add_argument('file', help='name of embedded file') parser.add_argument('-n', "--name", help='name for embedded file entry (default: file)') parser.add_argument('-d', "--desc", help='description (default: file)') parser.add_argument('-o', "--output", help = 'output PDF (default: modify pdf)') args = parser.parse_args() delim = args.desc # requested CSV delimiter character pdffn = args.pdf impfn = args.file doc = fitz.open(pdffn) if not args.name: name = impfn desc = args.desc if not args.desc: desc = impfn # to be on the safe side, always open as binary content = open(impfn, "rb").read() # read all file content in # import the file into the PDF doc.embeddedFileAdd(content, name, impfn, desc) # save PDF (either incremental or to new PDF file) if not args.output: doc.saveIncr() else:
from __future__ import print_function import fitz import sys, os, subprocess, tempfile, time ''' Optimizes a PDF with FileOptimizer. But as "/Producer" and "/Creator" get spoiled by this, we first save metadata and restore it after optimization. This means we also accept non-compressed object definitions (as created by FileOptimizer). ''' assert len(sys.argv) == 2, "need filename parameter" fn = sys.argv[1] assert fn.lower().endswith(".pdf"), "must be a PDF file" fullname = os.path.abspath(fn) # get the full path & name t0 = time.clock() # save current time doc = fitz.open(fullname) # open PDF to save metadata meta = doc.metadata doc.close() t1 = time.clock() # save current time again subprocess.call(["fileoptimizer64", fullname]) # now invoke FileOptimizer t2 = time.clock() # save current time again cdir = os.path.split(fullname)[0] # split dir from filename fnout = tempfile.mkstemp(suffix = ".pdf", dir = cdir) # create temp pdf name doc = fitz.open(fullname) # open now optimized PDF doc.setMetadata(meta) # restore old metadata doc.save(fnout[1], garbage = 4) # save temp PDF with it, a little sub opt doc.close() # close it os.remove(fn) # remove super optimized file
from __future__ import print_function import fitz import argparse #-------------------------------------------------------------------- # use argparse to handle invocation arguments #-------------------------------------------------------------------- parser = argparse.ArgumentParser(description="Enter CSV delimiter [;] and documment filename") parser.add_argument('-d', help='CSV delimiter [;]', default = ';') parser.add_argument('doc', help='document filename') args = parser.parse_args() delim = args.d # requested CSV delimiter character fname = args.doc # input document filename doc = fitz.open(fname) toc = doc.getToC(simple = False) ext = fname[-3:].lower() fname1 = fname[:-4] + "-toc.csv" outf = open(fname1, "w") for t in toc: t4 = t[3] if ext == "pdf": if t4["kind"] == 1: p4 = str(t4["to"].y) else: p4 = "0" else: p4 = "0" rec = delim.join([str(t[0]), t[1].strip(), str(t[2]), p4]) outf.writelines([rec, "\n"]) outf.close()
import fitz # import PyMuPDF doc = fitz.open("some.pdf") # or new: fitz.open(), followed by insertPage() page = doc[n] # choose some page rect = fitz.Rect(50, 100, 300, 400) # rectangle (left, top, right, bottom) in pixels text = """This text will only appear in the rectangle. Depending on width, new lines are generated as required.\n<- This forced line break will also appear.\tNow a very long word: abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.\nIt will be broken into pieces.""" rc = page.insertTextbox(rect, text, fontsize = 12, # choose fontsize (float) fontname = "Times-Roman", # a PDF standard font fontfile = None, # could be a file on your system align = 0) # 0 = left, 1 = center, 2 = right print("unused rectangle height: %g" % rc) # just demo (should display "44.2") doc.saveIncr() # update file. Save to new instead by doc.save("new.pdf",...)
(2) Easily adapt the example to combine just 2 pages (like for a booklet) or make the output page dimension dependent on input, or whatever. (3) This should run very fast: needed less than 25 sec on a Python 3.6 64bit, Windows 10, AMD 4.0 GHz for the 1'310 pages of the Adobe manual. Without save-options "garbage" and "deflate" this goes below 4 seconds, but results in a bigger file. Dependencies ------------- PyMuPDF 1.12.1 or later ''' from __future__ import print_function import fitz, sys infile = sys.argv[1] src = fitz.open(infile) doc = fitz.open() # empty output PDF width, height = fitz.PaperSize("a4") # A4 portrait output page format r = fitz.Rect(0, 0, width, height) # define the 4 rectangles per page r1 = r * 0.5 # top left rect r2 = r1 + (r1.width, 0, r1.width, 0) # top right r3 = r1 + (0, r1.height, 0, r1.height) # bottom left r4 = fitz.Rect(r1.br, r.br) # bottom right # put them in a list r_tab = [r1, r2, r3, r4] # now copy input pages to output
# Работаем с ПДФ файлами """ 1. Выведем только первые 5 страниц""" import fitz # Фитз входит в состав библиотеки PyMuPDF spisok = list(range(5)) # Список с номерами первых 5 страниц docu = fitz.open("SCAN.pdf") docu.select(spisok) # Удаляются все, кроме 5 страниц docu.save("SCAN_NEW.pdf", garbage=3) docu.close() """ import fitz stroka_1="ABCD"; stroka_2="EFGH"; stroka_3="IJKL" new_docu=fitz.open() new_docu.insertPage(text=stroka_1, fontsize=11) new_docu.insertPage(text=stroka_2, fontsize=20) new_docu.insertPage(text=None, fontsize=20) new_docu.insertPage(text=stroka_3, fontsize=20) new_docu.save("NewFile.pdf", garbage=3) new_docu_2=fitz.open("NewFile.pdf") spisok2=list(range(new_docu_2.pageCount)) for page_number in spisok2: if not new_docu.getPageText(page_number): spisok2.remove(page_number) new_docu_2.select(spisok2) new_docu_2.save("NewFileResult.pdf", garbage=3) new_docu.close() new_docu_2.close() """
Created on Thu Jul 16 12:50:03 2020 @author: Austin.Schrader """ import fitz import os imglist = [r'C:\\Users\\austin.schrader\\Desktop\\My_Desktop_Documents\\Python_Tools\\reading_emails_parse_attachment\\Attachments\\MI Payment from ACH Report.png'] pdf_path = r'C:\Users\austin.schrader\Desktop\My_Desktop_Documents\Python_Tools\reading_emails_parse_attachment\Attachments\MI Payment from ACH Report.pdf' for root, dirs, files in os.walk(r'C:\\Users\\austin.schrader\\Desktop\\My_Desktop_Documents\\Python_Tools\\reading_emails_parse_attachment\\Attachments'): for f in files: if f.endswith(".png"): try: doc = fitz.open() # PDF with the pictures for f in imglist: img = fitz.open(f) # open pic as document rect = img[0].rect # pic dimension pdfbytes = img.convertToPDF() # make a PDF stream img.close() # no longer needed imgPDF = fitz.open("pdf", pdfbytes) # open stream as PDF page = doc.newPage(width = rect.width, # new page with ... height = rect.height) # pic dimension page.showPDFpage(rect, imgPDF, 0) # image fills the page doc.save(pdf_path) os.remove(os.path.join(root, f)) except: pass
filename = '~/Sync/literature/村上春树_世界尽头与冷酷仙境_2007.pdf' outname = 'world.pdf' lineH = 15.0 # chap_start = 7 # chap_end = 256 # intermediate locfile = 'haha.pdf' xmpdf = 'xm.pdf' xmcpdf = 'xmc.pdf' cmd = 'cp ' + filename + ' ' + locfile os.system(cmd) doc = fitz.open(locfile) # doc.delete_pages(chap_end, -1) # doc.delete_pages(0, chap_start - 2) # repeat every page two times for origin_pagenum in range(len(doc)): pagenum = origin_pagenum * 2 doc.fullcopy_page(pagenum, pagenum) doc.save(xmpdf) doc.close() # crop each page crop(["-b", "c", "-ap", "73", "-p", "5", "-v", xmpdf, "-o", xmcpdf])
Usage: ------ extract_img4.py input.file ''' from __future__ import print_function import fitz import hashlib import sys, time assert len(sys.argv) == 2, 'Usage: %s <input file>' % sys.argv[0] t0 = time.clock() if str is bytes else time.perf_counter() doc = fitz.open(sys.argv[1]) # the PDF imgcount = 0 # counts extracted images hash_list = [] # records images already extracted # display some file info print("file: %s, pages: %i" % (sys.argv[1], len(doc))) for page in doc: # cycle through the document's pages js = page.getText("dict") # get a page's content in dict format blocks = js["blocks"] # we are interested in the blocks j = 0 # counts images per page for b in blocks: if b["type"] != 1: # not an image block continue fname = "p%i-%i." % (page.number, j) # file names look like so
)[SOURCE_EXCEL_START_ROW]: if row[0].value.strip() != '': name_list.append(row[config_module.get_config_obj() [SOURCE_EXCEL_FILE_NAME_COLUMN]].value) row_index = row_index + 1 return name_list print('正在初始化程序数据...') global_data_module.init() print('正在解析配置文件并读取Excel...') # 读取Excel file_names = read_asin_data(config_module.get_config_obj()[SOURCE_EXCEL]) print('正在解码PDF...') # 读取PDF pdf_doc = fitz.open(config_module.get_config_obj()[SOURCE_PDF]) print('PDF页数:' + str(pdf_doc.pageCount) + ', EXCEL 记录数:' + str(len(file_names))) if pdf_doc.pageCount != len(file_names): exit_module.tip_and_wait_then_exit('Excel的记录数量与PDF页数不匹配,无法继续操作') mat = fitz.Matrix(config_module.get_config_obj()[IMAGE_SCALE], config_module.get_config_obj()[IMAGE_SCALE]) if not os.path.exists(config_module.get_config_obj()[OUTPUT_DIR]): os.makedirs(config_module.get_config_obj()[OUTPUT_DIR]) for page_index in range(pdf_doc.pageCount): pdf_page = pdf_doc[page_index] print('开始转换写出第 ' + str(page_index + 1) + ' / ' + str(len(file_names)) + ' 页图片数据') pdf_page.getPixmap( matrix=mat).writePNG(config_module.get_config_obj()[OUTPUT_DIR] + os.sep + file_names[page_index] + '.png')
For XPS and EPUB input, internal links however **are** of type "LINK_NAMED". Base library MuPDF does not resolve them to page numbers. So, for anyone expert enough to know the internal structure of these document types, can further interpret and resolve these link types. Dependencies -------------- PyMuPDF v1.13.3 """ import fitz import sys if not (list(map(int, fitz.VersionBind.split("."))) >= [1,13,3]): raise SystemExit("insufficient PyMuPDF version") fn = sys.argv[1] doc = fitz.open(fn) if doc.isPDF: raise SystemExit("document is PDF already") print("Converting '%s' to '%s.pdf'" % (fn, fn)) b = doc.convertToPDF() # convert to pdf pdf = fitz.open("pdf", b) # open as pdf toc= doc.getToC() # table of contents of input pdf.setToC(toc) # simply set it for output meta = doc.metadata # read and set metadata if not meta["producer"]: meta["producer"] = "PyMuPDF v" + fitz.VersionBind if not meta["creator"]: meta["creator"] = "PyMuPDF PDF converter"
from __future__ import print_function import fitz import argparse #-------------------------------------------------------------------- # use argparse to handle invocation arguments #-------------------------------------------------------------------- parser = argparse.ArgumentParser( description="Enter CSV delimiter [;] and documment filename") parser.add_argument('-d', help='CSV delimiter [;]', default=';') parser.add_argument('doc', help='document filename') args = parser.parse_args() delim = args.d # requested CSV delimiter character fname = args.doc # input document filename doc = fitz.open(fname) toc = doc.getToC(simple=False) ext = fname[-3:].lower() fname1 = fname[:-4] + "-toc.csv" outf = open(fname1, "w") for t in toc: t4 = t[3] if ext == "pdf": if t4["kind"] == 1: p4 = str(t4["to"].y) else: p4 = "" else: p4 = "" rec = delim.join([str(t[0]), t[1].strip(), str(t[2]), p4]) outf.writelines([rec, "\n"]) outf.close()
import calendar import sys assert len(sys.argv) == 2, "need start year as the one and only parameter" startyear = sys.argv[1] assert startyear.isdigit(), "year must be positive numeric" startyear = int(startyear) assert startyear > 0, "year must be positive numeric" # We use a nicer mono-spaced font than the PDF builtin 'Courier'. # If you do not know one, set ffile to None and fname to 'Courier' ffile = "c:/windows/fonts/dejavusansmono.ttf" fname = "F0" doc = fitz.open() cal = calendar.LocaleTextCalendar(locale = "es") # use your locale #cal = calendar.TextCalendar() # or stick with English w, h = fitz.PaperSize("a4-l") # get sizes for A4 landscape paper txt = cal.formatyear(startyear, m = 4) doc.insertPage(-1, txt, fontsize = 12, fontname = fname, fontfile = ffile, width = w, height = h) txt = cal.formatyear(startyear + 1, m = 4) doc.insertPage(-1, txt, fontsize = 12, fontname = fname, fontfile = ffile, width = w, height = h) txt = cal.formatyear(startyear + 2, m = 4) doc.insertPage(-1, txt, fontsize = 12, fontname = fname, fontfile = ffile,
import os import fitz from mamba import description, it, before from fitzutils import ToCEntry from pdftocio.tocio import read_toc, write_toc dirpath = os.path.dirname(os.path.abspath(__file__)) level2 = os.path.join(dirpath, "files/level2.pdf") hastoc = os.path.join(dirpath, "files/hastoc.pdf") with description("read_toc") as self: with before.all: self.doc = fitz.open(level2) self.reference = fitz.open(hastoc) self.expect = [ ToCEntry(level=1, title='Section One', pagenum=1), ToCEntry(level=1, title='Section Two', pagenum=1), ToCEntry(level=2, title='Subsection Two.One', pagenum=2), ToCEntry(level=1, title='Section Three, with looong loooong looong title', pagenum=3), ToCEntry( level=2, title='Subsection Three.One, ' 'with even loooooooooooonger title, and probably even more', pagenum=3), ToCEntry(level=2, title='Subsection Three.Two', pagenum=4), ToCEntry(level=2, title='Subsection Three.Three', pagenum=5), ToCEntry(level=1, title='The End', pagenum=5)
from __future__ import print_function import sys import fitz #------------------------------------------------------------------------------ # Example program # License: GNU GPL V3 # Extracts an embedded file from an existing PDF # Command line: # python embedded-export.py input.pdf name export.file #------------------------------------------------------------------------------ pdffn = sys.argv[1] # PDF file name name = sys.argv[2] # embedded file identifier expfn = sys.argv[3] # filename of exported file doc = fitz.open(pdffn) # open PDF outfile = open(expfn, "wb") # to be on the safe side always open binary # extract file content. Will get exception on any error. content = doc.embeddedFileGet(name) outfile.write(content) outfile.close()
def __init__(self, url, config_dir, background_color):
    """Set up the viewer widget for the document at ``url``.

    Opens the document with ``fitz.open``, records the size of the first
    page's pixmap and the page count, initialises all view / link /
    search / selection / scroll / cache state to its defaults, and writes
    the document's table of contents to a file below
    ``<config_dir>/pdf-viewer/table/`` named after the MD5 hex digest of
    ``url``.

    :param url: location of the document, passed to ``fitz.open``.
    :param config_dir: base directory under which the table file is stored.
    :param background_color: stored on the instance as ``background_color``.
    """
    super(PdfViewerWidget, self).__init__()

    self.url = url
    self.config_dir = config_dir
    self.background_color = background_color
    self.installEventFilter(self)
    self.setMouseTracking(True)

    # The document has to be loaded before anything page-related is set up.
    self.document = fitz.open(url)

    # Page geometry is taken from the pixmap of the first page.
    first_page_pixmap = self.document.getPagePixmap(0)
    self.first_pixmap = first_page_pixmap
    self.page_width = first_page_pixmap.width
    self.page_height = first_page_pixmap.height
    self.page_total_number = self.document.pageCount

    # Scale and scale mode.
    self.scale = 1.0
    self.read_mode = "fit_to_width"

    # Inverted mode.
    self.inverted_mode = False

    # Link marking.
    self.is_mark_link = False
    self.mark_link_annot_cache_dict = {}

    # Link jumping.
    self.jump_link_key_cache_dict = {}
    self.jump_link_annot_cache_dict = {}

    # Global text search.
    self.is_mark_search = False
    self.search_text_offset_list = []
    self.search_text_annot_cache_dict = {}

    # Text selection.
    self.is_select_mode = False
    self.start_char_rect_index = self.start_char_page_index = None
    self.last_char_rect_index = self.last_char_page_index = None
    self.select_area_annot_cache_dict = {}
    # one slot per page index, all starting out as None
    self.char_dict = dict.fromkeys(range(self.page_total_number))

    # Scrolling.
    self.scroll_step = 20
    self.scroll_offset = 0
    self.mouse_scroll_offset = 20

    # Padding between pages.
    self.page_padding = 10

    # Page annotation metrics, colors and font.
    self.page_annotate_height = 22
    self.page_annotate_padding_right = 10
    self.page_annotate_padding_bottom = 10
    self.page_annotate_light_color = QColor("#333333")
    self.page_annotate_dark_color = QColor("#999999")
    self.font = QFont()
    self.font.setPointSize(12)

    # Page cache.
    self.page_cache_pixmap_dict = {}
    self.page_cache_scale = self.scale
    self.page_cache_trans = None
    self.page_cache_context_delay = 1000
    self.last_action_time = 0
    self.is_page_just_changed = False
    self.remember_offset = None

    # Save table in file for search framework, such as snails, search table to navigate.
    # Each line is "<page><4 spaces per level><title>".
    toc_lines = []
    for entry in self.document.getToC():
        level, title, page = entry[0], entry[1], entry[2]
        indent = self.repeat_to_length(" ", level * 4)
        toc_lines.append(str(page) + indent + title + "\n")
    table_info = "".join(toc_lines)

    url_digest = hashlib.md5(self.url.encode()).hexdigest()
    self.table_file_path = os.path.join(
        config_dir, "pdf-viewer", "table", url_digest)
    touch(self.table_file_path)
    with open(self.table_file_path, "w") as table_file:
        table_file.write(table_info)