Exemplo n.º 1
1
def repair_pdf(origname, newname):
    """
    Attempt to repair a PDF file.

    The file is first parsed with ``PdfReader``.  Only when that fails is the
    data re-written through PyMuPDF (``fitz``) and the cleaned version saved
    under ``newname``.

    :param origname: path of the PDF file to check.
    :param newname: path the repaired copy is written to.
    :return: ``"File Not Found"`` if ``origname`` does not exist, ``""`` if
        the file parsed cleanly and needed no repair, ``True`` if a repaired
        copy was written, ``False`` if the repair attempt raised ``ValueError``.
    """
    try:
        # The context manager guarantees the handle is released even if the
        # read itself fails.
        with open(origname, "rb") as ifile:
            idata = ifile.read()            # put in memory
    except FileNotFoundError:
        return "File Not Found"

    try:
        PdfReader(BytesIO(idata))
        return ""                           # file did not need to be repaired
    except Exception:                       # problem! heal it with PyMuPDF
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not mistaken for a broken PDF.
        doc = fitz.open("pdf", idata)       # open and save a corrected
        try:
            try:
                fixed = doc.write(garbage=3, deflate=1, clean=1)  # version in memory
            finally:
                doc.close()                 # release the document even on failure
            doc = idata = None              # free storage
            PdfWriter(newname, trailer=PdfReader(BytesIO(fixed))).write()
            return True                     # file has been fixed
        except ValueError:
            return False
Exemplo n.º 2
0
    def __init__(self, parent, pdf_file):
        """
        Open a PDF with fitz and record its page count and first-page size.

        :param parent: stored unchanged as ``self.parent``.
        :param `pdf_file`: either a string holding the path of a PDF file, or
        a file-like object offering the usual read/seek/tell methods.
        """
        self.parent = parent
        if not isinstance(pdf_file, string_types):
            # Not a string: treat it as a file-like object and hand its whole
            # content to fitz.open.  The dummy name only supplies the '.pdf'
            # extension that identifies the stream type.
            if pdf_file.tell() > 0:     # rewind unless already at the start
                pdf_file.seek(0)
            content = bytearray(pdf_file.read())
            pathname = 'fileobject.pdf'
            self.pdfdoc = fitz.open(pathname, content)
        else:
            # A filename/path string goes straight to fitz.open.
            pathname = pdf_file
            self.pdfdoc = fitz.open(pathname)

        self.numpages = self.pdfdoc.pageCount
        first_page = self.pdfdoc.loadPage(0)
        self.pagewidth = first_page.bound().width
        self.pageheight = first_page.bound().height
        self.page_rect = first_page.bound()
        self.zoom_error = False     # set if memory errors during render
Exemplo n.º 3
0
 def document(self):
     """Open the PDF with pymupdf and return the document object.

     When ``get_filename_and_fobj`` yields no filename, the file object is
     read completely and opened from memory; otherwise the file is opened by
     name.
     """
     filename, fobj = get_filename_and_fobj(self.filename_or_fobj, mode="rb")
     if filename:
         return pymupdf.open(filename=filename, filetype="pdf")
     # TODO: may use a lot of memory
     content = fobj.read()
     return pymupdf.open(stream=content, filetype="pdf")
Exemplo n.º 4
0
def return_image_obj(fs_path, memory=False):
    """
    Open an image, or the first page of a PDF, and return it as a Pillow image.

    For a path ending in ``.pdf`` the file is first checked with
    ``pdf_utilities.check_pdf`` and, if reported as damaged, repaired in place
    with ``pdf_utilities.repair_pdf``.  Page 0 is then rendered with fitz
    (PyMuPDF) and handed to Pillow as PNG data.

    Args:
        fs_path - File system path of the image or PDF.  When ``memory`` is
            true this is instead the raw image data (a byte string).
        memory (bool) - True if ``fs_path`` holds the image data itself
            rather than a path.  Not consulted for PDF paths.

    Returns:
        The Pillow image object, or ``None`` when a ``UserWarning`` was
        raised while opening it or (in memory mode) Pillow raised ``IOError``.

    Raises:
        Whatever ``Image.open`` raises for an unreadable path when
        ``memory`` is false; those errors are not caught here.
    """
    source_image = None
    if os.path.splitext(fs_path)[1][1:].lower() == u"pdf":
        results = pdf_utilities.check_pdf(fs_path)
        if not results[0]:
            pdf_utilities.repair_pdf(fs_path, fs_path)

        pdf_file = fitz.open(fs_path)
        try:
            pdf_page = pdf_file.loadPage(0)
            pix = pdf_page.getPixmap(matrix=fitz.Identity,
                                     alpha=True)
            try:
                # The PNG bytes are copied into the BytesIO buffer, so the
                # image stays usable after the document is closed below.
                source_image = Image.open(BytesIO(pix.getPNGData()))
            except UserWarning:
                print ("UserWarning!")
                source_image = None
        finally:
            pdf_file.close()            # do not leak the document handle
    elif not memory:
        source_image = Image.open(fs_path)
    else:
        try:    # fs_path is a byte stream
            source_image = Image.open(BytesIO(fs_path))
        except IOError:
            print("IOError")
            log.debug("PIL was unable to identify as an image file")
        except UserWarning:
            print ("UserWarning!")
            source_image = None
    return source_image
Exemplo n.º 5
0
def getPDFinfo():
    """Open ``spad.file`` and load page count, metadata and ToC into ``spad``.

    If a ``<file>.json`` companion exists, the user is asked whether to use
    the ToC and metadata saved there instead.  Answering "no" deletes that
    file.  Saved data that cannot be read completely is ignored as a whole.

    Returns:
        True if the document is still encrypted after ``decrypt_doc()`` was
        tried, otherwise False.
    """
    spad.doc = fitz.open(spad.file)
    if spad.doc.needsPass:
        decrypt_doc()
        if spad.doc.isEncrypted:
            return True
    spad.seiten = spad.doc.pageCount
    spad.meta = {"author": "", "title": "", "subject": ""}

    for key, wert in spad.doc.metadata.items():
        if wert:
            if pyversion < 3:
                spad.meta[key] = wert.decode("utf-8", "ignore")
            else:
                spad.meta[key] = wert
        else:
            spad.meta[key] = ""

    spad.fromjson = False
    spad.inhalt = spad.doc.getToC(simple=False)
    tocfile = spad.file + ".json"
    if os.path.exists(tocfile):
        dlg = wx.MessageDialog(
            None,
            "Saved data exist for this PDF - Use them instead?",
            "Input available from previous edit session",
            wx.YES_NO | wx.ICON_QUESTION,
        )
        rc = dlg.ShowModal()
        dlg.Destroy()
        dlg = None
        if rc == wx.ID_YES:
            try:
                with open(tocfile) as f_toc:    # closed even if parsing fails
                    saved = json.load(f_toc)
                # Pull out every value before touching spad, so that a
                # missing key cannot leave spad half-updated while the user
                # is told the saved data were ignored.
                toc = saved["toc"]
                author = saved["author"]
                title = saved["title"]
                subject = saved["subject"]
                keywords = saved["keywords"]
            except Exception:
                # Deliberately best-effort: any unreadable or incomplete
                # file is reported and skipped.
                dlg = wx.MessageDialog(None, "Ignoring saved data", "Invalid input from previous session")
                dlg.ShowModal()
                dlg.Destroy()
                dlg = None
            else:
                spad.fromjson = True
                spad.inhalt = toc
                spad.meta["author"] = author
                spad.meta["title"] = title
                spad.meta["subject"] = subject
                spad.meta["keywords"] = keywords
        else:
            os.remove(tocfile)
    return False
def procesarPDF(nombreArchivoEntrada, listaMaterias, fdSalida):
    """Extract the rows of a PDF and append them to ``fdSalida`` as CSV fields.

    Every page is exported by MuPDF as XML into the temporary file
    ``textPDFXML1.xml`` and fed to a SAX parser whose content handler is an
    ``OfertasGeneral`` instance.  The rows returned by the handler's
    ``subdividirFilas()`` are then turned into comma-separated strings and
    split into field lists.

    :param nombreArchivoEntrada: path of the PDF to process.
    :param listaMaterias: passed to the ``OfertasGeneral`` constructor.
    :param fdSalida: list-like object; one list of fields is appended per row.
    """
    doc = fitz.open(nombreArchivoEntrada)
    # Create an XML reader
    parser = xml.sax.make_parser()
    # Turn off namespaces
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    Handler = OfertasGeneral(listaMaterias)
    # override the default ContextHandler
    parser.setContentHandler(Handler)

    for num in range(0, doc.pageCount):
        # Process the PDF with MuPDF: the page text is extracted as XML.
        page = doc.loadPage(num)
        try:
            # The with-block closes (and therefore flushes) the temporary
            # file BEFORE the parser reads it, and releases one handle per
            # page instead of leaking all but the last.
            with open('textPDFXML1.xml', 'w') as f:
                f.write(page.getText(output="xml"))
        except OSError as ose:
            print("Error de E/S: ", ose)
        else:
            # Parse the XML file
            parser.parse("textPDFXML1.xml")

    try:
        remove('textPDFXML1.xml')
    except OSError:
        # No temporary file was created (e.g. a document without pages);
        # there is nothing to clean up.
        pass

    # Join each row into a single string and emit it in CSV format.
    for fil in Handler.subdividirFilas():
        if not fil:
            continue

        if len(fil) > 1:
            if isinstance(fil[1], tuple):
                acum = fil[0] + ',A'
                horariosOrdenados = sorted(fil[1:], key=ordenarDias)
            else:
                acum = ",".join(fil[:2])
                horariosOrdenados = sorted(fil[2:], key=ordenarDias)

            if horariosOrdenados:
                acum += componerHorarioCSV(horariosOrdenados) + ','
            else:
                acum += ',,,,,,Y'
        else:
            # exactly one element
            acum = fil[0] + ',A,,,,,,Y'

        fdSalida.append(acum.split(','))
Exemplo n.º 7
0
def getPDFinfo():
    """Open ``PDFcfg.file`` and store its ToC, page count and metadata in ``PDFcfg``.

    Returns True when the document is still encrypted after the optional
    ``decrypt_doc()`` call, otherwise False.
    """
    PDFcfg.doc = fitz.open(PDFcfg.file)
    if PDFcfg.doc.needsPass:
        decrypt_doc()
    if PDFcfg.doc.isEncrypted:
        return True

    PDFcfg.inhalt = PDFcfg.doc.getToC()
    PDFcfg.seiten = PDFcfg.doc.pageCount

    # Start from empty defaults, then overlay whatever the document provides;
    # empty values become "" and everything else is decoded from UTF-8.
    info = {"author": "", "title": "", "subject": ""}
    info.update(
        (name, value.decode("utf-8", "ignore") if value else "")
        for name, value in PDFcfg.doc.metadata.items()
    )
    PDFcfg.meta = info
    return False
Exemplo n.º 8
0
    def __init__(self, url, background_color):
        """Set up the viewer state for the document found at ``url``.

        The document is opened with fitz; the pixmap of page 0 supplies the
        page width and height stored on the widget.
        """
        super(PdfViewerWidget, self).__init__()

        self.url, self.background_color = url, background_color

        # The document has to be loaded before anything can be measured.
        self.document = fitz.open(url)

        # Page geometry is taken from the pixmap of the first page.
        first_pixmap = self.document.getPagePixmap(0)
        self.first_pixmap = first_pixmap
        self.page_width, self.page_height = first_pixmap.width, first_pixmap.height
        self.page_total_number = self.document.pageCount

        # Scale and scale mode.
        self.scale = 1.0
        self.read_mode = "fit_to_width"

        # Scrolling.
        self.scroll_step, self.scroll_offset = 20, 0
        self.mouse_scroll_offset = 20

        # Gap between pages.
        self.page_padding = 10

        # Page annotation layout and font.
        self.page_annotate_height = 22
        self.page_annotate_padding_right = 10
        self.page_annotate_padding_bottom = 10
        self.page_annotate_color = QColor("#333333")
        annotate_font = QFont()
        annotate_font.setPointSize(12)
        self.font = annotate_font

        # Page cache.
        self.page_cache_pixmap_dict = {}
        self.page_cache_scale = self.scale
        self.page_cache_trans = None
        self.page_cache_context_delay = 1000

        # Miscellaneous bookkeeping.
        self.last_action_time = 0
        self.is_page_just_changed = False
        self.remember_offset = None
def return_image_obj(fs_path):
    """Open an image file, or the first page of a PDF, as an RGB Pillow image.

    :param fs_path: file system path of the image or PDF; the decision is
        made on the file extension (case-insensitive ``pdf``).
    :return: the Pillow image, converted to mode ``"RGB"`` if it was opened
        in any other mode.
    """
    # os.path.splitext (the original called the non-existent "splitxt")
    fext = os.path.splitext(fs_path)[1][1:].upper()
    if fext == "PDF":
        pdf_file = fitz.open(fs_path)
        try:
            pdf_page = pdf_file.loadPage(0)
            # fitz.Identity is the identity matrix (the original referenced
            # the non-existent "fitz.Identify").
            pix = pdf_page.getPixmap(matrix=fitz.Identity,
                                     colorspace="rgb",
                                     alpha=True)
            # The PNG bytes are copied into the buffer, so the image does
            # not depend on the document staying open.
            source_image = Image.open(BytesIO(pix.getPNGData()))
        finally:
            pdf_file.close()
    else:
        source_image = Image.open(fs_path)

    if source_image.mode != "RGB":
        source_image = source_image.convert('RGB')
    return source_image

    
Exemplo n.º 10
0
def check_pdf(filename):
    """
    Verify the structure of a PDF file with the PyMuPDF library.

    :param filename: The FQPN filename of the file to check
    :type filename: String

    :return: A tuple that contains
      * Boolean - Is Clean (True if no issue, False if issue)
      * Generic error message, eg. expected generation number
      * Raw error message, eg. expected generation number (25366 ? obj)
    :rtype: Tuple

    The generic error message is the raw message cut off at the first "(",
    which removes the changing data so that it can be used in the filtered
    excel report.

    .. code-block:: python

        >>> check_pdf(r"test_samples\\badpdf\\Administrative - 30 - Consent to Treat 02-16-07 - 7712.pdf")
        (False, 'expected generation number', 'expected generation number (25366 ? obj)')
        >>> check_pdf(r"test_samples\\badpdf\\Administrative - 30 - PayPol 05-27-08 - 7713.pdf")
        (False, 'expected generation number', 'expected generation number (17469 ? obj)')
        >>> check_pdf(r"test_samples\\goodpdf\\CCD_extract_101001-00.html.pdf")
        (True, '', '')
        >>> check_pdf(r"test_samples\\goodpdf\\CCD_extract_101002-00.html.pdf")
        (True, '', '')
    """
    try:
        pdffile = fitz.open(filename)
        raw_errmsg = pdffile.openErrMsg
        errorcode = pdffile.openErrCode
    except RuntimeError:
        # A truly fatal error occurred; it is assumed to mean file not found.
        # (Need to verify FNF is the only condition this applies to.)
        raw_errmsg, errorcode = "File Not Found", -1

    if "(" in raw_errmsg:
        # Keep only the text ahead of the first parenthesis.
        errmsg = raw_errmsg.partition("(")[0].strip()
    else:
        # No parenthesis (this includes the empty "no error" message).
        errmsg = raw_errmsg
    return (errorcode == 0, errmsg, raw_errmsg)
Exemplo n.º 11
0
    def scrape_urls(self):
        """Collect the URLs found in the text of ``self.local_file``.

        Per page: newlines are removed from the text, a newline is put in
        front of every 'http', and the result is split on newlines.  Each
        piece that starts with 'http' goes through
        ``self.extract_url_from_line``; results not yet in ``self.urls`` are
        appended to it.  Nothing happens when ``self.local_file`` is falsy.
        """
        if not self.local_file:
            return

        pdf = fitz.open(self.local_file)
        for page in pdf:
            flattened = page.getText().replace('\n', '')
            for line in flattened.replace('http', '\nhttp').split('\n'):
                if not line.startswith('http'):
                    continue
                url = self.extract_url_from_line(line)
                if url not in self.urls:
                    self.urls.append(url)
Exemplo n.º 12
0
    def __init__(self, filepath, main_window, parent=None):
        """Create the view and open the book file found at ``filepath``.

        The extension of ``filepath`` selects how the book is opened: 'cbz'
        as a zip archive, 'cbr' as a rar archive and 'pdf' through fitz.  For
        any other extension ``self.book`` is not set here.
        """
        super(PliantQGraphicsView, self).__init__(parent)
        self._translate = QtCore.QCoreApplication.translate
        self.parent = parent
        self.main_window = main_window

        self.image_pixmap = None
        self.image_cache = [None] * 4

        self.thread = None

        self.annotation_dict = self.parent.metadata['annotations']

        self.filepath = filepath
        extension = os.path.splitext(self.filepath)[1]
        self.filetype = extension[1:]       # drop the leading dot

        if self.filetype == 'pdf':
            self.book = fitz.open(self.filepath)
        elif self.filetype == 'cbr':
            self.book = rarfile.RarFile(self.filepath)
        elif self.filetype == 'cbz':
            self.book = zipfile.ZipFile(self.filepath)

        self.common_functions = PliantWidgetsCommonFunctions(
            self, self.main_window)

        # Mouse / wheel handling
        self.ignore_wheel_event = False
        self.ignore_wheel_event_number = 0
        self.setMouseTracking(True)
        self.setDragMode(QtWidgets.QGraphicsView.ScrollHandDrag)

        # Custom context menu
        self.setContextMenuPolicy(QtCore.Qt.CustomContextMenu)
        self.customContextMenuRequested.connect(
            self.generate_graphicsview_context_menu)
Exemplo n.º 13
0
 def read_book(self):
     """Open ``self.filename`` with fitz and keep the document in ``self.book``."""
     document = fitz.open(self.filename)
     self.book = document
Exemplo n.º 14
0
def OneJpg2OnePdf(jpg_path, pdf_path, pdf_name):
    """Convert one image file into a PDF saved as ``<pdf_path>/<pdf_name>.pdf``.

    :param jpg_path: path of the image file to convert.
    :param pdf_path: directory the PDF is written to.
    :param pdf_name: file name of the PDF without the ``.pdf`` extension.
    """
    # Local import keeps this block self-contained.
    import os

    imgdoc = fitz.open(jpg_path)
    try:
        # Document.convertToPDF (the original called the misspelled
        # "convetToPdf", which raises AttributeError).
        img_byte = imgdoc.convertToPDF()
    finally:
        imgdoc.close()

    img_pdf = fitz.open("pdf", img_byte)
    try:
        # os.path.join instead of a hard-coded '\\' so that the file lands
        # inside pdf_path on every platform.
        img_pdf.save(os.path.join(pdf_path, '{}.pdf'.format(pdf_name)))
    finally:
        img_pdf.close()
Exemplo n.º 15
0
    updated_end_index = end_index + pre_index
    mod_para_content = para_content[updated_start_index:updated_end_index]
    super_string = mod_para_content
    page = fitz_doc_obj[page_number - 1]
    super_area = page.searchFor(super_string)
    text_instances = page.searchFor(elem)
    for inst in text_instances:
        if bb_intersection_over_union(inst, super_area):
            specific_word_coordinates_list = [
                inst[0], inst[1], inst[2], inst[3]
            ]
            break
    return specific_word_coordinates_list


if __name__ == '__main__':
    # Manual check: look up the coordinates of one substring of a paragraph
    # on the first page of a sample PDF.
    pdf_path = "/Users/nr012/Downloads/2015-3.pdf"
    fitz_doc_obj = fitz.open(pdf_path)
    page_no = 1
    start_index, end_index = 51, 60
    para_content = "The following important factors, and other factors described elsewhere in this Report or contained in our other filings with the U.S. Securities and Exchange Commission (SEC), among others, could cause our results to differ materially from any results described in any forward-looking statements:."
    # start_index, end_index = 16, 28 #successfully in 1Page.pdf
    # para_content = "We believe that successfully meeting these objectives will generate financial performance exceeding that of our peers and result in full and fair valuation of our common shares."
    elem = para_content[start_index:end_index]
    # An end index of 0 means there is nothing to search for.
    specific_word_coordinates_list = (
        []
        if end_index == 0
        else get_specific_word_coords(
            para_content, page_no, elem, fitz_doc_obj, start_index, end_index)
    )
    print(specific_word_coordinates_list)
Exemplo n.º 16
0
import fitz
# Write every image referenced by the pages of "calvin2.pdf" to a PNG file
# named after the page number and the image's xref.
doc = fitz.open("calvin2.pdf")
for i in range(len(doc)):
    for img in doc.getPageImageList(i):
        xref = img[0]                   # first item of the entry
        png_name = "p%s-%s.png" % (i, xref)
        pix = fitz.Pixmap(doc, xref)
        if pix.n >= 5:                  # CMYK: convert to RGB first
            pix1 = fitz.Pixmap(fitz.csRGB, pix)
            pix1.writePNG(png_name)
            pix1 = None
        else:                           # this is GRAY or RGB
            pix.writePNG(png_name)
        pix = None
Exemplo n.º 17
0
    F1 = ' , '.join(map(str, F2))

    return A1, P1, R1, F1


########################################### PDF EXTRACTION #################################

# The PDF name comes from the first command-line argument and is appended to
# "public" by plain string concatenation (no path separator is inserted).
carp = "public"
pdf = sys.argv[1]
pdf = carp + pdf
# NOTE(review): presumably tabula's read_pdf; `df` is not used again in the
# visible part of the script - confirm before removing.
df = read_pdf(pdf, pages="2")
# Export page "2" as CSV to the fixed file name "frmXYelim-291272.csv".
tabula.convert_into(pdf, ("frmXYelim-291272" + '.csv'),
                    output_format="csv",
                    pages="2")
pdf_documento = pdf
documento = fitz.open(pdf_documento)
# NOTE(review): loadPage(1) is presumably the same page as pages="2" above,
# assuming fitz numbers pages from 0 - confirm.
pagina = documento.loadPage(1)
text = pagina.getText("text")
# Lower-case the text, replace runs of punctuation/digits by a blank and split
# into words.  Inside the character class ")-." is a range (')' through '.'),
# so '*', '+', ',' and '-' are matched as well.
doc = (re.sub('[!?@#$()-.,;:*/0-9%"]+', ' ', text.lower())).split()

################################### WHILE LOOP TO OBTAIN THE DESIRED TEXT ############
c = 0           # index of the current word in `doc`
m = 0
hecho = []
demanda = ""

while (c < len(doc)):
    if (doc[c] == "hecho"):
        c = c + 1
        while (c < len(doc)):
            if (doc[c] == "sumario"):
Exemplo n.º 18
0
def make_pdf(dlg):
    """Join the page ranges listed in the dialog into one new PDF file.

    Every row of ``dlg.szr02.Table.data`` names an input file (item 0), a
    1-based first and last page (items 2 and 3) and a rotation angle
    (item 4).  Unless ``dlg.noToC.Value`` is set, a table of contents is
    built from one bookmark per input file followed by that file's own ToC
    entries, renumbered to the output pages.

    :param dlg: the dialog object holding the user's input.
    :return: the output path (``dlg.btn_aus.GetPath()``), or ``None`` if no
        input files were listed.
    """
    # no file selected: treat like "QUIT"
    if not len(dlg.szr02.Table.data):       # no files there - quit
        return None
    # create time zone value in PDF format
    cdate = fitz.getPDFnow()
    ausgabe = dlg.btn_aus.GetPath()
    pdf_out = fitz.open()              # empty new PDF document
    aus_nr = 0                         # current page number in output
    pdf_dict = {"creator": "PDF Joiner",
                "producer": "PyMuPDF",
                "creationDate": cdate,
                "modDate": cdate,
                "title": dlg.austit.Value,
                "author": dlg.ausaut.Value,
                "subject": dlg.aussub.Value,
                "keywords": dlg.keywords.Value}
    pdf_out.setMetadata(pdf_dict)      # put in meta data
    total_toc = []                     # initialize TOC
    # =========================================================================
    # process one input file
    # =========================================================================
    for zeile in dlg.szr02.Table.data:
        dateiname = zeile[0]
        doc = fitz.open(dateiname)
        max_seiten = len(doc)
        # =====================================================================
        # user input minus 1, PDF pages count from zero
        # also correct any inconsistent input
        # =====================================================================
        von = int(zeile[2]) - 1             # first PDF page number
        bis = int(zeile[3]) - 1             # last PDF page number

        von = min(max(0, von), max_seiten - 1)   # "from" must be in range
        bis = min(max(0, bis), max_seiten - 1)   # "to" must be in range
        rot = int(zeile[4])                 # get rotation angle
        # now copy the page range
        pdf_out.insertPDF(doc, from_page=von, to_page=bis,
                          rotate=rot)
        if dlg.noToC.Value:                 # no ToC wanted - get next file
            doc.close()                     # do not leak the input document
            continue

        incr = 1                            # standard increment for page range
        if bis < von:
            incr = -1                       # increment for reversed sequence
        # list of page numbers in range
        pno_range = list(range(von, bis + incr, incr))
        # standard bookmark title = "infile [pp from-to of max.pages]"
        bm_main_title = "%s [pp. %s-%s of %s]" % \
              (os.path.basename(dateiname[:-4]), von + 1,
               bis + 1, max_seiten)
        # insert standard bookmark ahead of any page range
        total_toc.append([1, bm_main_title, aus_nr + 1])
        toc = doc.getToC(simple=False)      # get file's TOC
        last_lvl = 1                        # immunize against hierarchy gaps
        # Entries that are not "goto" links reuse the most recent page
        # number.  Start each file at its first output page so that `pno`
        # is never unbound (first file) or left over from the previous file.
        pno = aus_nr + 1
        for t in toc:
            lnk_type = t[3]["kind"]         # if "goto", page must be in range
            if (t[2] - 1) not in pno_range and lnk_type == fitz.LINK_GOTO:
                continue
            if lnk_type == fitz.LINK_GOTO:
                pno = pno_range.index(t[2] - 1) + aus_nr + 1
            # repair hierarchy gaps by filler bookmarks
            while (t[0] > last_lvl + 1):
                total_toc.append([last_lvl + 1, "<>", pno, t[3]])
                last_lvl += 1
            last_lvl = t[0]
            t[2] = pno
            total_toc.append(t)

        aus_nr += len(pno_range)       # increase output counter
        doc.close()
        doc = None

    # =========================================================================
    # all input files processed
    # =========================================================================
    if total_toc:
        pdf_out.setToC(total_toc)
    pdf_out.save(ausgabe)
    pdf_out.close()
    return ausgabe
Exemplo n.º 19
0
The main purpose of this function is to demonstrate that working with PyMuPDF
is easy and straightforward ...
What does introduce some complexity is the ability to scale the image and to
flip it left-right while keeping the text legible.
-------------------------------------------------------------------------------
New (2017-09-21):
-----------------
Scaling and other morphing effects can now also be achieved with a morphing
matrix. This became possible once the page method "insertTextbox" supported it as well.
-------------------------------------------------------------------------------
"""
#==============================================================================
# invoke the pencil function
#==============================================================================
if __name__ == "__main__":
    doc=fitz.open()                         # empty new PDF
    page = doc.newPage()                    # create page (A4)
    img  = page.newShape()                  # create shape
# =============================================================================
#   pencil 1
# =============================================================================
    penheight = 100                         # thickness of pencil
    pentip = fitz.Point(100, 150)           # first pencil tip here
    pencil(img, pentip, penheight, True)    # pencil points left
# =============================================================================
#   pencil 2    
# =============================================================================
    penheight = 20                          # now a smaller one
    pentip = fitz.Point(100, 250)           # new pencil tip
    pencil(img, pentip, penheight, False)   # this one points right
    
Exemplo n.º 20
0
for root, dirs, files in os.walk(IN_DIR, topdown=False):
    for name in files:
        all_pdf_files.append(join(root, name))

with open("out.csv", "w") as file:

    file.write(
        f"\"Číslo pracovnej cesty\"; \"Meno a priezvisko\"; \"EVČ\"; \"Suma\"\n"
    )

    for pdf_file in all_pdf_files:

        print(f"File: {pdf_file}")

        with fitz.open(pdf_file) as doc:
            text = ""
            for page in doc:
                text += page.getText()

        print(text)

        # ID
        x = re.findall("\nVyúčtovanie pracovnej cesty č. [0-9]+\n", text)
        print(x)
        cp_id = x[0].split(" ")[-1].strip()
        print(f"ID: {cp_id}")
        file.write(f"\"{cp_id}\";")

        # Meno
        x = re.findall("\nPriezvisko, meno, titul:\n.+\n", text)
Exemplo n.º 21
0
"""
Demo / Experimental: Replace the fonts in a PDF.

"""
import fitz
import sys

fname = sys.argv[1]

doc = fitz.open(fname)  # input PDF
out = fitz.open()  # output PDF
# Read the font mapping with a context manager so the file handle is closed
# right away instead of being left to the garbage collector.
with open("fonts.csv") as font_table:
    csv = font_table.read().splitlines()
# will contain: (old basefont name, Base14 name)
all_fonts = [f.split(";") for f in csv]


def pdf_color(srgb):
    """Create a PDF color triple from a given sRGB color integer.

    :param srgb: color as an integer of the form 0xRRGGBB.
    :return: tuple ``(r, g, b)`` of floats in the range 0..1.

    The channels are taken out with integer shifts and masks.  The previous
    version used true division (``srgb /= 256``), which kept the fractional
    remainder of the lower bytes and so distorted the green and red values.
    """
    srgb = int(srgb)
    r = (srgb >> 16) & 0xFF
    g = (srgb >> 8) & 0xFF
    b = srgb & 0xFF
    return (r / 255, g / 255, b / 255)


def get_font(fontname):
    """Lookup base fontname and return one of the "reserved" Base14 fontnames.
    """
def pdf2pic(path, pic_path):
    """Export the RGB and CMYK images found in the PDF at ``path`` as PNG files.

    Every xref object of the document is scanned.  Objects whose source text
    matches both the XObject and the Image pattern are loaded as pixmaps.
    Three-component images are written as they are, four-component images
    are converted to RGB first, and all others are only reported.  Target
    file names come from ``new_img_path(path, pic_path, counter)``.
    Progress and timing are printed.
    """
    t0 = time.perf_counter()

    checkXO = r'/Type(?= */XObject)'
    checkIM = r'/Subtype(?= */Image)'

    doc = fitz.open(path)
    imgcount = 0            # images exported
    total_img_cnt = 0       # images found

    lenXREF = doc._getXrefLength()

    print(f'path:({path}) pages:({len(doc)}) object:({lenXREF-1})')

    for i in range(1, lenXREF):
        text = doc._getXrefString(i)
        isXObject = re.search(checkXO, text)
        isImage = re.search(checkIM, text)

        # Only objects that are both an XObject and an Image are of interest.
        if not (isXObject and isImage):
            continue

        print(f'[{i}]:--------------')
        print(f'[{i}]:text:({text})')

        total_img_cnt += 1

        pix = fitz.Pixmap(doc, i)
        print(f'[{i}]:pix:({pix})')
        print(f'[{i}]:pix.colorspace:({pix.colorspace})')

        cs = pix.colorspace

        print(f'[{i}]:cs:name({cs.name})) value({cs.n}) pix.n({pix.n})')

        components = cs.n
        if components in (3, 4):
            imgcount += 1
            new_path = new_img_path(path, pic_path, imgcount)
            if components == 3:  # csRGB
                pix.writePNG(new_path)
            else:  # csCMYK: convert to RGB before writing
                pix0 = fitz.Pixmap(fitz.csRGB, pix)
                pix0.writePNG(new_path)
                pix0 = None
        elif components == 1:  # csGRAY
            print(f'[{i}]:ignore gray image.')
        elif components == 2:  # unknown
            print(f'[{i}]:unknown colorspace.({cs})')
        else:
            print(f'[{i}]:error.unknown colorspace({cs})')

        pix = None

    t1 = time.perf_counter()

    print(f'found ({total_img_cnt}) images,({imgcount}) exported.')
    print(f'done.needs ({t1-t0}) secs')
Exemplo n.º 23
0
def valid(upload_pics='.'):
    """Compare the OCR text of a reference PDF with the OCR text of uploaded pictures.

    Each page of the reference PDF is rendered and run through
    ``client.basicAccurate``, as is every file found in ``./pics``.  For each
    PDF page the uploaded text with the highest ``get_equal_rate`` score is
    selected, and for every pair that is not a perfect match an HTML diff is
    appended to ``./diff.html``.

    :param upload_pics: currently unused; the pictures are always read from
        the ``pics`` directory below the working directory.
    """
    def get_content_ocr(ocrObject):
        """Return the recognised words, one per line, or '' for a malformed result."""
        try:
            return ''.join(
                item['words'] + '\n' for item in ocrObject['words_result'])
        except (KeyError, TypeError):
            # A result without 'words_result' / 'words' (or one that is not
            # a mapping at all) counts as "no text".
            return ''

    pdf_path = '2020劳动合同范文.pdf.pdf'
    doc = fitz.open(pdf_path)
    x = doc[0].getPixmap()
    valid_pdf_img = [item.getPixmap() for item in doc]
    doc.close()
    # recognised text of every page of the original pdf
    src_pdf_text = [
        get_content_ocr(client.basicAccurate(item.getImageData(output='png')))
        for item in valid_pdf_img
    ]

    upload_img = []
    for i in os.listdir('./pics'):
        with open('pics/' + i, 'rb') as f:
            upload_img.append(f.read())
    upload_text = [
        get_content_ocr(client.basicAccurate(item)) for item in upload_img
    ]

    # Sort: for every source page pick the uploaded text that matches best.
    upload_text_sort = []
    for src_text in src_pdf_text:
        similaritys = [
            get_equal_rate(src_text, usr_text) for usr_text in upload_text
        ]
        # index() yields the first position of the maximum and evaluates
        # max() once instead of once per element.
        index = similaritys.index(max(similaritys))
        upload_text_sort.append(upload_text[index])

    compare_html = os.path.join('.', 'diff.html')

    compare_sum = 0
    print(len(src_pdf_text))
    for src_txt, up_txt in zip(src_pdf_text, upload_text_sort):
        print('#' * 35)
        print('src_txt:', src_txt)
        src_remove_char, up_remove_char = compares(src_txt), compares(up_txt)
        similarity_remove = get_equal_rate(src_remove_char, up_remove_char)
        compare_sum += similarity_remove
        diff = difflib.HtmlDiff()
        result = diff.make_file(src_txt.split('\n'), up_txt.split('\n'))
        if similarity_remove < 1:
            try:
                # The context manager closes the file even if the write fails.
                with open(compare_html, "a", encoding='utf-8') as fd_diff:
                    fd_diff.write(result)
            except Exception:
                # Best effort: report the failure and go on with the next page.
                import traceback
                traceback.print_exc()
Exemplo n.º 24
0
rc = False
if str is bytes:
    imgdir = sys.argv[1]               # where my files are
else:
    rc, imgdir = psg.GetPathBox("Make a PDF from Attached Files",
                                "Enter file directory:")

if not imgdir:
    raise SystemExit()

t0 = mytime()                          # set start timer

width, height = fitz.PaperSize("a6-l") # get paper format

doc = fitz.open()                      # open empty PDF
page = doc.newPage(width = width,      # make new page
                   height = height)

# define sub rect to receive text and annotation symbols
rect = fitz.Rect(0, 0, width, height) + (36, 36, -36, -36)

imglist = os.listdir(imgdir)           # directory listing
imgcount = len(imglist)                # number of files

# calculate number of pages we will create
per_page = ((width - 72) // 25) * ((height - 36 - 56) // 35)
pages = int(round(imgcount / per_page + 0.5))

# header text
text = "Contains the following %i files from '%s':\n\n" % (imgcount, imgdir)
Exemplo n.º 25
0
highlight = "this text is highlighted"
underline = "this text is underlined"
strikeout = "this text is striked out"
squiggled = "this text is zigzag-underlined"
red = (1, 0, 0)
blue = (0, 0, 1)
gold = (1, 1, 0)
green = (0, 1, 0)

displ = fitz.Rect(0, 50, 0, 50)
r = fitz.Rect(72, 72, 220, 100)
t1 = u"têxt üsès Lätiñ charß,\nEUR: €, mu: µ, super scripts: ²³!"

font = fitz.Font("helv")  # used by the TextWriter class

doc = fitz.open()
page = doc.newPage()

page.setRotation(0)

# following makes sure that TextWriter references the **unrotated** page rect
# as everything else does ...
page_rect = page.rect * page.derotationMatrix


def print_descr(annot):
    """Print a short description to the right of the annot rect."""
    rect = annot.rect
    page = annot.parent
    writer = fitz.TextWriter(page_rect, color=red)
    writer.append(rect.br + (10, -5),
Exemplo n.º 26
0
    def __init__(self, parent, filename):
        """Create the dialog that displays the pages of ``filename``.

        Opens the document with PyMuPDF, calls ``self.decrypt_doc()`` if a
        password is needed, creates the navigation controls and the page
        image area, lays them out in sizers and binds the event handlers.
        If the document is still encrypted after the decryption attempt,
        the dialog is destroyed and initialization stops early.

        :param parent: parent window, passed on to ``wx.Dialog.__init__``.
        :param filename: document to open; passed to ``fitz.open`` and
            appended to the dialog title.
        """
        defPos = wx.DefaultPosition
        defSiz = wx.DefaultSize
        zoom   = 1.2                        # zoom factor of display
        wx.Dialog.__init__ (self, parent, id = wx.ID_ANY,
            title = u"Display with PyMuPDF: ",
            pos = defPos, size = defSiz,
            style = wx.CAPTION|wx.CLOSE_BOX|
                    wx.DEFAULT_DIALOG_STYLE)

        #======================================================================
        # display an icon top left of dialog, append filename to title
        #======================================================================
        if do_icon:                                  # module-level switch
            self.SetIcon(ico_pdf.img.GetIcon())      # set a screen icon
        self.SetTitle(self.Title + filename)
        self.SetBackgroundColour(wx.Colour(240, 230, 140))

        #======================================================================
        # open the document with MuPDF when dialog gets created
        #======================================================================
        self.doc = fitz.open(filename) # create Document object
        if self.doc.needsPass:         # check password protection
            self.decrypt_doc()
        if self.doc.isEncrypted:       # quit if we cannot decrypt
            self.Destroy()
            return
        self.dl_array = [0] * len(self.doc)  # one slot per page, initially 0
        self.last_page = -1            # memorize last page displayed
        self.link_rects = []           # store link rectangles here
        self.link_texts = []           # store link texts here
        self.current_idx = -1          # store entry of found rectangle
        # NOTE(review): the original comment here duplicated the one above;
        # presumably this holds the links of the current page - confirm.
        self.current_lnks = []

        #======================================================================
        # define zooming matrix for displaying PDF page images
        # we increase images by 20%, so take 1.2 as scale factors
        #======================================================================
        self.matrix = fitz.Matrix(zoom, zoom)    # will use a constant zoom

        '''
        =======================================================================
        Overall Dialog Structure:
        -------------------------
        szr10 (main sizer for the whole dialog - vertical orientation)
        +-> szr20 (sizer for buttons etc. - horizontal orientation)
          +-> button forward
          +-> button backward
          +-> field for page number to jump to
          +-> field displaying total pages
        +-> PDF image area
        =======================================================================
        '''

        # forward button
        self.ButtonNext = wx.Button(self, wx.ID_ANY, u"forw",
                           defPos, defSiz, wx.BU_EXACTFIT)
        # backward button
        self.ButtonPrevious = wx.Button(self, wx.ID_ANY, u"back",
                           defPos, defSiz, wx.BU_EXACTFIT)
        #======================================================================
        # text field for entering a target page. wx.TE_PROCESS_ENTER is
        # required to get data entry fired as events.
        #======================================================================
        self.TextToPage = wx.TextCtrl(self, wx.ID_ANY, u"1", defPos, wx.Size(40, -1), 
                             wx.TE_RIGHT|wx.TE_PROCESS_ENTER)
        # displays total pages and page paper format
        self.statPageMax = wx.StaticText(self, wx.ID_ANY,
                              "of " + str(len(self.doc)) + " pages.",
                              defPos, defSiz, 0)
        # checkbox "show links", switched on initially
        self.links = wx.CheckBox( self, wx.ID_ANY, u"show links",
                           defPos, defSiz, wx.ALIGN_LEFT)
        self.links.Value = True
        # initially empty label; NOTE(review): presumably filled with the
        # paper format elsewhere (e.g. by pdf_show) - confirm.
        self.paperform = wx.StaticText(self, wx.ID_ANY, "", defPos, defSiz, 0)
        # define the area for page images and load page 1 for primary display
        self.PDFimage = wx.StaticBitmap(self, wx.ID_ANY, self.pdf_show(1),
                           defPos, defSiz, style = 0)
        #======================================================================
        # the main sizer of the dialog
        #======================================================================
        self.szr10 = wx.BoxSizer(wx.VERTICAL)
        szr20 = wx.BoxSizer(wx.HORIZONTAL)
        szr20.Add(self.ButtonNext, 0, wx.ALL, 5)
        szr20.Add(self.ButtonPrevious, 0, wx.ALL, 5)
        szr20.Add(self.TextToPage, 0, wx.ALL, 5)
        szr20.Add(self.statPageMax, 0, wx.ALIGN_CENTER_VERTICAL|wx.ALL, 5)
        szr20.Add( self.links, 0, wx.ALIGN_CENTER_VERTICAL|wx.ALL, 5 )
        szr20.Add(self.paperform, 0, wx.ALIGN_CENTER_VERTICAL|wx.ALL, 5)
        # sizer ready, represents top dialog line
        self.szr10.Add(szr20, 0, wx.EXPAND, 5)
        self.szr10.Add(self.PDFimage, 0, wx.ALL, 5)
        # main sizer now ready - request final size & layout adjustments
        self.szr10.Fit(self)
        self.SetSizer(self.szr10)
        self.Layout()
        # center dialog on screen
        self.Centre(wx.BOTH)

        # Bind buttons and fields to event handlers
        self.ButtonNext.Bind(wx.EVT_BUTTON, self.NextPage)
        self.ButtonPrevious.Bind(wx.EVT_BUTTON, self.PreviousPage)
        self.TextToPage.Bind(wx.EVT_TEXT_ENTER, self.GotoPage)
        # mouse events on the page image: wheel, movement and left click
        self.PDFimage.Bind(wx.EVT_MOUSEWHEEL, self.OnMouseWheel)
        self.PDFimage.Bind(wx.EVT_MOTION, self.move_mouse)
        self.PDFimage.Bind(wx.EVT_LEFT_DOWN, self.OnLeftDown)
Exemplo n.º 27
0
 def get_pdf_text(self, path):
     """Return the lower-cased text of all pages of the document at ``path``.

     The text of every page is followed by a single space, so the result
     ends with a space for any document that has at least one page; an
     empty document yields an empty string.

     :param path: file name passed to ``fitz.open``.
     :return: the concatenated, lower-cased page texts.
     """
     doc = fitz.open(path)
     try:
         # join once instead of growing a string page by page
         text = "".join(page.getText(flags=0) + " " for page in doc)
     finally:
         doc.close()  # release the file handle even if extraction fails
     return text.lower()
Exemplo n.º 28
0
    while (length < len(lst_text)):
        run = table.cell(2, 1).paragraphs[0].add_run('***')
        run.font.color.rgb = RGBColor(255, 0, 0)
        run = table.cell(2, 1).paragraphs[0].add_run(' ')
        length += 1

    while (length < len(lst_ocr)):
        run = table.cell(2, 1).paragraphs[0].add_run(lst_ocr[length])
        run.font.color.rgb = RGBColor(255, 0, 0)
        run = table.cell(2, 1).paragraphs[0].add_run(' ')
        length += 1


# The script works on the first entry that os.listdir reports for the
# 'uploads' directory (raises IndexError if that directory is empty).
file = os.listdir('uploads')
PDF = 'uploads/' + file[0]
pdfDocument = fitz.open(PDF)
# NOTE(review): Document() is presumably python-docx's Document, used to
# build the result file - confirm against the imports.
document = Document()
pageNum = pdfDocument.pageCount      # number of pages in the PDF
# accumulators, initially empty
textPDF = ''
textOCR = ''
textDiff = ''
# output file locations
documentPDF = "ready/textPDF.txt"
documentOCR = "ready/textOCR.txt"
documentRES = 'ready/RESULT.doc'

# textPDF = textFromPDF(pdfDocument)
doImage(PDF)
# NOTE(review): doImage presumably writes one "page<i>.jpg" per page, which
# the loop below then reads - confirm in doImage.
for i in range(pageNum):
    file_png = "page" + str(i) + ".jpg"   # note: a .jpg name despite "png"
    tmpTextOCR = text_EASYOCR(file_png)   # OCR result for this page image
    # tmpTextOCR = textFromPDF_OCR_1(file_png)
Exemplo n.º 29
0
Export Script toc2csv.py
-------------------------
import argparse
import os

import fitz

#--------------------------------------------------------------------
# use argparse to handle invocation arguments
#--------------------------------------------------------------------
parser = argparse.ArgumentParser(description="Enter CSV delimiter [;] and documment filename")
parser.add_argument('-d', help='CSV delimiter [;]', default = ';')
parser.add_argument('doc', help='document filename')
args = parser.parse_args()
delim = args.d               # requested CSV delimiter character
fname = args.doc          # input document filename

doc = fitz.open(fname)
toc = doc.getToC(simple = False)      # entries: [level, title, page, dest dict]

# Split off the real extension instead of assuming it is three characters
# long: "book.epub" now yields "book-toc.csv" (formerly "book.-toc.csv").
root, suffix = os.path.splitext(fname)
ext = suffix[1:].lower()
fname1 = root + "-toc.csv"

# "with" closes the output file even if writing an entry fails
with open(fname1, "w") as outf:
    for t in toc:
        t4 = t[3]                     # destination details of this entry
        # only PDF entries of kind 1 carry a target point ("to")
        if ext == "pdf" and t4["kind"] == 1:
            p4 = str(t4["to"].y)      # add vertical destination if present
        else:
            p4 = ""
        rec = delim.join([str(t[0]), t[1].strip(), str(t[2]), p4])
        outf.write(rec + "\n")
Exemplo n.º 30
0
    msg = ["%i glyphs" % font.glyph_count, "size %i" % len(font.buffer)]
    if flags["mono"] == 1:
        msg.append("mono")
    if flags["serif"]:
        msg.append("serifed")
    if flags["italic"]:
        msg.append("italic")
    if flags["bold"]:
        msg.append("bold")
    msg = ", ".join(msg)
    return msg


infilename = sys.argv[1]               # document to inspect
font_list = set()                      # unique (font name, description) pairs
doc = fitz.open(infilename)
for pno in range(len(doc)):
    # walk through the font entries reported for this page
    for entry in doc.getPageFontList(pno, full=True):
        subset, fontname = get_fontnames(doc, entry)

        if entry[1] != "n/a":
            # build a Font from the last item of the extraction result
            fontbuffer = doc.extractFont(entry[0])[-1]
            description = make_msg(fitz.Font(fontbuffer=fontbuffer))
        else:
            description = "Not embedded!"

        if subset:
            description += ", subset font"
        font_list.add((fontname, description))
Exemplo n.º 31
0
def fill_in_fake_data_on_exams(paper_dir_path, classlist, outfile, which=None):
    """Fill-in exams with fake data for demo or testing.

    Every selected paper gets random answers written on all pages but the
    first.  Papers that are not prenamed additionally get a randomly chosen
    student number and name on their front page.  All papers are appended
    to one PDF, occasionally followed by a generated "extra page".

    Arguments:
        paper_dir_path {Str or convertable to pathlib obj} -- Directory containing the blank exams.
        classlist (list): ordered list of (sid, sname) pairs.
        outfile {Str} -- Path to write results into this concatenated PDF file.

    Keyword Arguments:
        which {type} -- by default, scribble on all exams or specify
                           something like `which=range(10, 16)` here to scribble on a
                           subset. (default: {None})

    Raises:
        AssertionError: if a text box could not be filled.
        ValueError: if the class list has fewer unused student numbers
            than there are papers needing one (from ``random.sample``).
    """

    # Customizable data
    blue = [0, 0, 0.75]
    student_number_length = 8
    extra_page_probability = 0.2
    digit_font_size = 24
    answer_font_size = 13
    extra_page_font_size = 18

    # We create the path objects
    paper_dir_path = Path(paper_dir_path)
    out_file_path = Path(outfile)

    print("Annotating papers with fake student data and scribbling on pages...")
    # Prenamed papers (those with an ID number in the file name) are looked
    # up in every case: formerly this list only existed when `which` was
    # not given, so any call with `which` died with a NameError below.
    named_papers_paths = glob(str(paper_dir_path / "exam_*_*.pdf"))
    if not which:
        papers_paths = sorted(glob(str(paper_dir_path / "exam_*.pdf")))  # everything
    else:
        # plain strings, so that the membership tests against the glob
        # results (strings as well) compare like with like
        papers_paths = sorted(
            str(paper_dir_path / "exam_{}.pdf".format(str(index).zfill(4)))
            for index in which
        )
    named_papers = set(named_papers_paths)

    # need to avoid any student numbers already used to name papers - look at file names
    used_ids = {
        os.path.split(file_name)[1].split(".")[0].split("_")[-1]
        for file_name in named_papers_paths
    }
    # now load in the student names and numbers - only those not used to prename
    clean_id_dict = {
        sid: sname for sid, sname in classlist if sid not in used_ids
    }

    # now grab a random selection of IDs from the dict: one for every
    # selected paper that is not prenamed.
    unnamed_count = sum(1 for p in papers_paths if p not in named_papers)
    id_sample = random.sample(list(clean_id_dict.keys()), unnamed_count)

    # A complete collection of the pdfs created
    all_pdf_documents = fitz.open()

    clean_count = 0
    for index, file_name in enumerate(papers_paths):
        is_prenamed = file_name in named_papers
        if is_prenamed:
            print("{} - prenamed paper - scribbled".format(os.path.basename(file_name)))
        else:
            student_number = id_sample[clean_count]
            student_name = clean_id_dict[student_number]
            clean_count += 1
            print(
                "{} - scribbled using {} {}".format(
                    os.path.basename(file_name), student_number, student_name
                )
            )

        pdf_document = fitz.open(file_name)
        try:
            front_page = pdf_document[0]

            # First we input the student names
            if not is_prenamed:  # can draw on front page
                # insert digit images into rectangles - some hackery required to get correct positions.
                width = 28
                border = 8
                for digit_index in range(student_number_length):
                    rect1 = fitz.Rect(
                        220 + border * digit_index + width * digit_index,
                        265,
                        220 + border * digit_index + width * (digit_index + 1),
                        265 + width,
                    )
                    # pick one of the `number_of_digits` stored images of
                    # this digit at random; entries are base64-encoded
                    uuImg = digit_array[
                        int(student_number[digit_index]) * number_of_digits
                        + random.randrange(number_of_digits)
                    ]
                    img_BString = base64.b64decode(uuImg)
                    front_page.insertImage(
                        rect1, stream=img_BString, keep_proportion=True
                    )
                    # TODO - there should be an assert or something here?

                digit_rectangle = fitz.Rect(228, 335, 550, 450)
                insertion_confirmed = front_page.insertTextbox(
                    digit_rectangle,
                    student_name,
                    fontsize=digit_font_size,
                    color=blue,
                    fontname="Helvetica",
                    fontfile=None,
                    align=0,
                )
                assert insertion_confirmed > 0

            # Write some random answers on the pages
            for page_index, pdf_page in enumerate(pdf_document):
                # the random values are drawn for every page (also the
                # skipped front page) to keep the random sequence unchanged
                random_answer_rect = fitz.Rect(
                    100 + 30 * random.random(), 150 + 20 * random.random(), 500, 500
                )
                random_answer_text = random.choice(possible_answers)

                # TODO: "helv" vs "Helvetica"
                if page_index >= 1:
                    insertion_confirmed = pdf_page.insertTextbox(
                        random_answer_rect,
                        random_answer_text,
                        fontsize=answer_font_size,
                        color=blue,
                        fontname="helv",
                        fontfile=None,
                        align=0,
                    )
                    assert insertion_confirmed > 0

            # delete last page from the zeroth test.
            if index == 0:
                pdf_document.deletePage(-1)
                print("Deleting last page of test {}".format(file_name))

            # We then add the pdfs into the document collection
            all_pdf_documents.insertPDF(pdf_document)
        finally:
            # the pages have been copied; do not keep every paper open
            pdf_document.close()

        # For a comprehensive test, we add some extra pages with a probability of 0.2
        if random.random() < extra_page_probability:
            # folder_name/exam_XXXX.pdf or folder_name/exam_XXXX_YYYYYYY.pdf,
            # file_pdf_name drops the folder name and the .pdf parts
            file_pdf_name = os.path.splitext(os.path.basename(file_name))[0]

            # Then we get the test number and student_number from file_pdf_name
            test_number = file_pdf_name.split("_")[1]
            if is_prenamed:  # file_pdf_name is exam_XXXX_YYYYYYY
                student_number = file_pdf_name.split("_")[2]

            print(
                "  making an extra page for test {} and sid {}".format(
                    test_number, student_number
                )
            )
            all_pdf_documents.insertPage(
                -1,
                text="EXTRA PAGE - t{} Q1 - {}".format(test_number, student_number),
                fontsize=extra_page_font_size,
                color=blue,
            )

    # `str(out_file_path)` also works with pymupdf < 1.16.14, which does not
    # accept path objects: https://github.com/pymupdf/PyMuPDF/issues/466
    # Here we only need to save the generated pdf files with random test answers
    all_pdf_documents.save(str(out_file_path))
    all_pdf_documents.close()
    print('Assembled in "{}"'.format(out_file_path))
Exemplo n.º 32
0
It scans through all objects and selects /Type/XObject with /Subtype/Image.
So runtime is determined by number of objects and image volume.
Usage:
extract_img2.py input.pdf
'''
from __future__ import print_function
import fitz
import sys, time, re

checkXO = r"/Type(?= */XObject)"  # finds "/Type/XObject"
checkIM = r"/Subtype(?= */Image)"  # finds "/Subtype/Image"

# explicit check instead of "assert", which is dropped under "python -O"
if len(sys.argv) != 2:
    raise SystemExit('Usage: %s <input file>' % sys.argv[0])

# time.clock() no longer exists as of Python 3.8. It is still preferred
# where available so that the timer stays the same on older interpreters.
t0 = time.clock() if hasattr(time, "clock") else time.perf_counter()
doc = fitz.open(sys.argv[1])
imgcount = 0
lenXREF = doc._getXrefLength()  # number of objects - do not use entry 0!

# display some file info
print("file: %s, pages: %s, objects: %s" %
      (sys.argv[1], len(doc), lenXREF - 1))

for i in range(1, lenXREF):  # scan through all objects
    text = doc._getObjectString(i)  # string defining the object
    isXObject = re.search(checkXO, text)  # tests for XObject
    isImage = re.search(checkIM, text)  # tests for Image
    if not isXObject or not isImage:  # not an image object if not both True
        continue
    pix = fitz.Pixmap(doc, i)  # make pixmap from image
    if pix.colorspace is None:  # this is just a mask!
Exemplo n.º 33
0
    if flags & 2**1:
        l.append("italic")
    if flags & 2**2:
        l.append("serifed")
    else:
        l.append("sans")
    if flags & 2**3:
        l.append("monospaced")
    else:
        l.append("proportional")
    if flags & 2**4:
        l.append("bold")
    return ", ".join(l)


# Open the sample document and take its first page.
doc = fitz.open("text-tester.pdf")
page = doc[0]

# read page text as a dictionary, suppressing extra spaces in CJK fonts
blocks = page.get_text("dict", flags=11)["blocks"]
for b in blocks:  # iterate through the text blocks
    for l in b["lines"]:  # iterate through the text lines
        for s in l["spans"]:  # iterate through the text spans
            print("")  # empty line between spans
            # one-line description of the span's font; it is only built
            # here, not printed by the statements visible in this loop
            font_properties = "Font: '%s' (%s), size %g, color #%06x" % (
                s["font"],  # font name
                flags_decomposer(s["flags"]),  # readable font flags
                s["size"],  # font size
                s["color"],  # font color
            )
            print("Text: '%s'" % s["text"])  # simple print of text
Exemplo n.º 34
0
def generateDoc(title, data, dataStart, dataEnd, bio1, bio2, bio3, bio4,
                vid_filename, textfile, filename, pageNum, path):
    """Build a one-page PDF report and return it as an open document.

    The page holds a centered title, three EEG heatmaps with captions,
    four biometric plots, a text box and an image.  The plot and heatmap
    image files are generated first (via ``bioPlotter`` and ``eeg``) under
    names starting with ``path + "page<pageNum>_"``.

    :param title: text of the page title.
    :param data: data handed to the plot and heatmap generators.
    :param dataStart: start of the data range handed to the generators.
    :param dataEnd: end of the data range handed to the generators.
    :param bio1: name of the first biometric; also part of its plot file
        name.  ``bio2``, ``bio3`` and ``bio4`` work the same way.
    :param vid_filename: image file placed in the video frame area.
    :param textfile: passed to ``textHandler.createTextbox``.
    :param filename: unused; the document is no longer saved here.
    :param pageNum: page number, used in the generated file names.
    :param path: prefix (directory) for the generated image files.
    :return: the ``fitz`` document; saving/closing is up to the caller.
    """

    # Creates a new blank PDF
    doc = fitz.open()
    generatedPage = doc.newPage()

    font = "Times-Roman"
    fontSize = 24
    titleLength = fitz.getTextlength(title, font, fontSize)

    # Ensures that the title will always be centered, despite text length
    pageRect = generatedPage.bound()
    pageMidpoint_X = (pageRect.x1 - pageRect.x0) / 2
    titleStartPoint = fitz.Point(pageMidpoint_X - (titleLength / 2),
                                 fontSize + 11)
    generatedPage.insertText(titleStartPoint,
                             title,
                             fontname=font,
                             fontsize=fontSize,
                             rotate=0)

    # common prefix of all image files generated for this page
    pagePrefix = path + "page%i_" % pageNum

    # Autogenerates the biometric plots
    bioFilenames = []
    for bio in (bio1, bio2, bio3, bio4):
        bioFilename = pagePrefix + bio + ".png"
        bioPlotter.plotBiometric(data, dataStart, dataEnd, bio, bioFilename)
        bioFilenames.append(bioFilename)

    # Autogenerates the EEG heatmaps
    eeg.eeg_viz(data, dataStart, dataEnd, pagePrefix + "eeg_")
    fontSize = 14

    # Inserts heatmap visualizations, each with a caption centered below it.
    # Each caption is centered using its OWN text length; the Beta and
    # Theta captions used to be positioned with the length of "Alpha Band".
    heatmaps = (
        ("alpha", "Alpha Band", fitz.Rect(10, 50, 198, 238)),
        ("beta", "Beta Band", fitz.Rect(203, 50, 391, 238)),
        ("theta", "Theta Band", fitz.Rect(396, 50, 585, 238)),
    )
    for band, caption, location in heatmaps:
        generatedPage.insertImage(location,
                                  filename=pagePrefix + "eeg_%s.png" % band,
                                  keep_proportion=False)
        textLength = fitz.getTextlength(caption, font, fontSize)
        # 94 = half the heatmap width
        startPoint = fitz.Point((location.x0 + 94) - textLength / 2, 240)
        generatedPage.insertText(startPoint,
                                 caption,
                                 fontname=font,
                                 fontsize=fontSize,
                                 rotate=0)

    # Inserts biometric plots.  The lower row (bio3, bio4) goes in first:
    # the rows overlap vertically, so the insertion order is kept as is.
    bioPlacements = (
        (bioFilenames[2], fitz.Rect(10, 443, 300, 653)),   # bio3
        (bioFilenames[3], fitz.Rect(305, 443, 595, 653)),  # bio4
        (bioFilenames[0], fitz.Rect(10, 253, 300, 463)),   # bio1
        (bioFilenames[1], fitz.Rect(305, 253, 595, 463)),  # bio2
    )
    for bioFilename, location in bioPlacements:
        generatedPage.insertImage(location,
                                  filename=bioFilename,
                                  keep_proportion=False)

    # Generates textbox
    textboxBack_Location = fitz.Rect(250, 650, 585, 815)
    textHandler.createTextbox(textfile, textboxBack_Location, generatedPage,
                              path)

    # Inserts video frame
    vidFrame_Location = fitz.Rect(25, 675, 245, 799)
    generatedPage.insertImage(vidFrame_Location,
                              filename=vid_filename,
                              keep_proportion=False)

    # The PDF is not saved here anymore; the caller gets the document.
    return doc
Exemplo n.º 35
0
# create color list sorted down by hue, value, saturation
# NOTE(review): the actual sort order is defined by sortkey(), which is not
# shown here - confirm it matches the description above.
mylist = sorted(getColorInfoList(), reverse = True, key=lambda x: sortkey(x))

w = 800            # page width
h = 600            # page height
rw = 80            # width of color rect
rh = 60            # height of color rect

num_colors = len(mylist)     # number of color triples
black = getColor("black")    # text color
white = getColor("white")    # text color
fsize = 8                    # fontsize
lheight = fsize *1.2         # line height
idx = 0                      # index in color database
doc = fitz.open()            # empty PDF
# One page per pass; each page is a grid of 10 rows x 10 columns of color
# rectangles (10 * rw = w and 10 * rh = h).
while idx < num_colors:
    doc.insertPage(-1, width = w, height = h)    # new empty page
    page=doc[-1]                                 # load it
    for i in range(10):                          # row index
        if idx >= num_colors:                    # no colors left
            break
        for j in range(10):                      # column index
            rect = fitz.Rect(rw*j, rh*i, rw*j + rw, rh*i + rh)  # color rect
            cname = mylist[idx][0].lower()       # color name
            col = mylist[idx][1:]                # color tuple -> to floats
            col = (col[0] / 255., col[1] / 255., col[2] / 255.)
            page.drawRect(rect, color = col, fill = col)   # draw color rect
            pnt1 = rect.top_left + (0, rh*0.3)   # pos of color name in white
            pnt2 = pnt1 + (0, lheight)           # pos of color name in black
            page.insertText(pnt1, cname, fontsize = fsize, color = white)
Exemplo n.º 36
0
def getText(file):
    """Return the plain text of the first page of a document.

    :param file: file name passed to ``fitz.open``.
    :return: the text of page 0 as delivered by ``getText("text")``.
    """
    pdf = fitz.open(file)
    try:
        return pdf.loadPage(0).getText("text")
    finally:
        pdf.close()  # do not leave the file open after extraction
Exemplo n.º 37
0
line = 0  # spreadsheet row counter; raised by one per PDF processed below


def get_info(tx, par):
    """Pick one piece of information out of the text lines of a certificate.

    :param tx: iterable of text lines (strings).
    :param par: ``'I'`` returns what follows the 16th character of the
        first line starting with ``Certificate Id:`` or
        ``Certificate No:``; ``'C'`` returns the first complete line
        starting with ``Course completed on``.
    :return: the text found, or ``None`` if no line matches or ``par`` is
        neither ``'I'`` nor ``'C'``.
    """
    if par == 'I':
        id_prefixes = ('Certificate Id:', 'Certificate No:')
        # skip the 15-character label plus the character following it
        return next(
            (entry[16:] for entry in tx if entry.startswith(id_prefixes)),
            None,
        )
    if par == 'C':
        return next(
            (entry for entry in tx if entry.startswith('Course completed on')),
            None,
        )
    return None


# Collect one spreadsheet row per PDF certificate found in directory pth.
for certificate in os.listdir(path=pth):
    if certificate.endswith('.pdf'):
        line += 1

        pdfFileObj = fitz.open(os.path.join(pth, certificate))
        try:
            pageText = pdfFileObj.loadPage(0).getText()  # first page only
        finally:
            pdfFileObj.close()  # do not keep every certificate open

        textlines = pageText.split('\n')

        # Column A takes the third text line; the cell stays empty (like a
        # get_info() miss) if the page has fewer than three lines.
        worksheet["A" + str(line)] = textlines[2] if len(textlines) > 2 else None
        worksheet["B" + str(line)] = get_info(textlines, 'C')
        worksheet["C" + str(line)] = get_info(textlines, 'I')

workbook.save("Certificates.xlsx")
Exemplo n.º 38
0
    # 4:Sig  2:NbDisk 2: NbCD 2:TotalDisk 2:TotalCD
    # 	4:CDSize 4:Offset 2:ComLen
    offset = filedata.rfind("PK\5\6") + 20

    # new comment length
    length = len(filedata) - offset - 2

    with open(name, "wb") as f:
        f.write(filedata[:offset])
        f.write(struct.pack("<H", length))
        f.write(filedata[offset + 2:])


# command line: <pdf file> <file to add>
pdf, attach = sys.argv[1:3]

doc = fitz.open(pdf)
try:
    with open(attach, 'rb') as f:
        data = f.read()

    if ATTACHED:
        # add as attachment
        createAttachment(doc, attach, data)
        doc.saveIncr()
    else:
        # add as extra stream
        # appending one null byte to terminate the archive comment;
        # a bytes literal is needed because the file was read in binary
        # mode (bytes + str raises TypeError on Python 3)
        addStreamData(doc, data + b"\0")
        # 255 = decompress all objects
        doc.save(doc.name, incremental=True, expand=255)
finally:
    doc.close()  # also when reading or saving fails
Exemplo n.º 39
0
driver.close()

#%% EXTRACTING TEXT BLOCKS
'''
This block extracts all paragraphs from EY VAT tax guides
'''

files = os.listdir(main_dir + tax_guides)  # EY worldwide tax guide directory
personal_ind = 'worldwide-vat-gst-and-sales-tax-guide-'  # VAT tax guide indicator

text = []           # one entry per text block of all matching guides
filename_aux = []   # parallel to text: the file each block came from
for file in files:
    if not file.startswith(personal_ind):
        continue    # not a VAT tax guide
    # NOTE(review): the bare name from os.listdir is opened, which only
    # works if the current directory is the listed one - confirm.
    doc = fitz.open(file)
    num_pages = doc.pageCount
    for page in range(num_pages):
        # element 4 of each block tuple holds the block's string
        text_ = [block[4] for block in doc.loadPage(page).getText("blocks")]
        text.extend(text_)
        # allows to identify tax guide and year of every block
        filename_aux.extend([file] * len(text_))

#%% GETTING YEAR + COUNTRYNAME
'''
In this block I will identify the country and the year from both the filename
and the text within each file.
'''
Exemplo n.º 40
0
            pix2.n == 1):
        print("unexpected /SMask situation: pix1", pix1, "pix2", pix2)
        return pix1
    pix = fitz.Pixmap(pix1)       # copy of pix1, alpha channel added
    pix.setAlpha(pix2.samples)    # treat pix2.samples as alpha values
    pix1 = pix2 = None            # free temp pixmaps
    return pix

checkXO = r"/Type(?= */XObject)"       # finds "/Type/XObject"
checkIM = r"/Subtype(?= */Image)"      # finds "/Subtype/Image"

# time.clock() no longer exists as of Python 3.8. It is still preferred
# where available so that the timer stays the same on older interpreters.
t0 = time.clock() if hasattr(time, "clock") else time.perf_counter()

fname = sys.argv[1] # file name
fpref = sys.argv[2] # image file prefix
doc = fitz.open(fname)
imgcount = 0
lenXREF = doc._getXrefLength()         # object count - do not use entry 0!

# display some file info
print("")
print(__file__, "PDF: %s, pages: %s, objects: %s" % (fname, len(doc), lenXREF-1))

smasks = [] # stores xrefs of /SMask objects
#------------------------------------------------------------------------------
# loop through PDF images
#------------------------------------------------------------------------------
for i in range(1, lenXREF):            # scan through all objects
    try:
        text = doc._getObjectString(i) # PDF object definition string
    except:
Exemplo n.º 41
0
from __future__ import print_function
import fitz
import sys

#==============================================================================
# Pie Chart program - semi circle version
#==============================================================================
from fitz.utils import getColor        # for getting RGB colors by name
doc = fitz.open()                      # new empty PDF
doc.insertPage()                       # creates an ISO-A4 page
page = doc[-1]                         # this is the page
img = page.newShape()                  # shape object created from the page
# title of the page
# (German: "Seat distribution after the 2013 federal election")
title = "Sitzverteilung nach der Bundestagswahl 2013"
# pie chart center and point of 1st data pie
center = fitz.Point(200, 250)
point  = fitz.Point(100, 250)          # will cycle through table data
# this is the radius
# NOTE(review): relies on abs() of a point difference giving the distance
# between the two points - confirm in the PyMuPDF Point documentation.
radius = abs(point - center)

blue = getColor("blue")                # we need some colors
white = getColor("white")

lineheight = 20                        # legend line height
ts_v  = 150                            # vertical start of legend block
ts_h  = center.x + radius + 50         # horizontal coord of legend block

# these are the data to visualize:
# number of seats of political parties in German parliament since 2013
table  = (       # seats, party color & name 
          (64, "violetred", "Die Linke"),
Exemplo n.º 42
0
import fitz
doc = fitz.open("some.pdf")      # open pdf
page = doc[n]                    # open the page (0-based number)

# first pass: remember the rectangle of every annotation, in page order
rtab = []
current = page.firstAnnot
while current:
    rtab.append(current.rect)
    current = current.next

# second pass: hand the rectangles out again, last one first
annot = page.firstAnnot
for position in range(len(rtab) - 1, -1, -1):
    annot.setRect(rtab[position])
    annot = annot.next

doc.save("some-reversed.pdf")    # save PDF with reversed annotations
Exemplo n.º 43
0
# -*- coding: utf-8 -*-
"""
PyMuPDF Example Script:
------------------------

Split a given PDF into separate files of one page each.
For "input.pdf" the generated files are named "input-%i.pdf".

PyMuPDF license
"""

import os
import sys

import fitz

fn = sys.argv[1]                       # input file name

# Strip the real extension, whatever its length: the former fn[:-4] cut
# four characters even from names like "input" or "scan.xps.pdf2".
fn1 = os.path.splitext(fn)[0]

src = fitz.open(fn)
try:
    for i in range(len(src)):
        doc = fitz.open()              # new empty PDF for this page
        try:
            doc.insertPDF(src, from_page = i, to_page = i)
            doc.save("%s-%i.pdf" % (fn1, i))
        finally:
            doc.close()                # also when inserting/saving fails
finally:
    src.close()
    
Exemplo n.º 44
0
def pdf2pic(pdf_path, pic_path):
    """
    Extract the image objects of a PDF, binarize them and save them as PNG.

    Every PDF object whose definition marks it as an image XObject is loaded
    as a pixmap. Each pixel whose first colour component is greater than 170
    is set to 254 in all colour components; every other pixel is set to 50.
    The result is written to ``<pic_path>/<running number>.png``.
    Progress and the total run time are printed.

    :param pdf_path: path of the PDF file to read
    :param pic_path: directory that receives the PNG files
    :return: None
    """
    t0 = time.perf_counter()  # start of the run

    # An image is an object that is both "/Type /XObject" and
    # "/Subtype /Image"; compile the patterns once, outside the loop.
    check_xo = re.compile(r"/Type(?= */XObject)")
    check_im = re.compile(r"/Subtype(?= */Image)")

    c1, c2 = 170, 50  # brightness threshold, replacement value for dark pixels
    img_count = 0  # number of images written so far

    doc = fitz.open(pdf_path)
    try:
        len_XREF = doc._getXrefLength()  # number of entries in the xref table

        # print some information about the PDF
        print("文件名:{}, 页数: {}, 对象: {}".format(pdf_path, len(doc), len_XREF - 1))

        for i in range(1, len_XREF):
            text = doc._getXrefString(i)  # object definition as a string
            # skip everything that is not both an XObject and an image
            if not (check_xo.search(text) and check_im.search(text)):
                continue

            img_count += 1
            pix = fitz.Pixmap(doc, i)  # pixmap of the image object

            # CMYK cannot be written as PNG. Convert it to RGB *before*
            # touching pixels so the colour values below mean the same
            # thing for every image.
            if pix.n - pix.alpha >= 4:
                pix = fitz.Pixmap(fitz.csRGB, pix)

            # setPixel() needs exactly pix.n values, so build the colours
            # for the actual number of colour components and carry any
            # alpha byte of the pixel over unchanged.
            ncolor = pix.n - pix.alpha
            light = [254] * ncolor
            dark = [c2] * ncolor
            for x in range(pix.w):
                for y in range(pix.h):
                    px = pix.pixel(x, y)
                    base = light if px[0] > c1 else dark
                    pix.setPixel(x, y, base + list(px[ncolor:]))

            new_name = os.path.join(pic_path, f'{img_count}.png')
            pix.writePNG(new_name)

            pix = None  # release the pixmap
            print("提取了第{}张图片".format(img_count))
    finally:
        doc.close()  # close the document even if an image failed

    t1 = time.perf_counter()  # end of the run
    print("总共提取了{}张图片".format(img_count))
    print("运行时间:{}s".format(t1 - t0))
Exemplo n.º 45
0
# Command line:
# python embedded-import.py some.pdf embed.file
#------------------------------------------------------------------------------

parser = argparse.ArgumentParser(description="Enter PDF, file to embed, and optional name, description and output pdf.")
parser.add_argument('pdf', help='PDF filename')
parser.add_argument('file', help='name of embedded file')
parser.add_argument('-n', "--name", help='name for embedded file entry (default: file)')
parser.add_argument('-d', "--desc", help='description (default:  file)')
parser.add_argument('-o', "--output", help = 'output PDF (default: modify pdf)')
args = parser.parse_args()
# NOTE(review): "delim" is not used in the visible part of this script; it
# looks like a leftover from a CSV example. Kept in case later code reads it.
delim = args.desc
pdffn = args.pdf
impfn = args.file

doc = fitz.open(pdffn)

# Entry name and description both default to the name of the imported file.
# The name must be bound in either case: previously it was only assigned
# when --name was omitted, so passing --name ended in a NameError.
name = args.name or impfn
desc = args.desc or impfn

# to be on the safe side, always open as binary; close the handle right away
with open(impfn, "rb") as infile:
    content = infile.read()             # read all file content in

# import the file into the PDF
doc.embeddedFileAdd(content, name, impfn, desc)
# save PDF (either incremental or to new PDF file)
if not args.output:
    doc.saveIncr()
else:
Exemplo n.º 46
0
from __future__ import print_function
import fitz
import sys, os, subprocess, tempfile, time
'''
Optimizes a PDF with FileOptimizer. But as "/Producer" and "/Creator" get
spoiled by this, we first save metadata and restore it after optimization.
This means we also accept non-compressed object definitions (as created by 
FileOptimizer).
'''
# Validate the command line with real checks: assert statements vanish
# when Python runs with -O.
if len(sys.argv) != 2:
    raise SystemExit("need filename parameter")
fn = sys.argv[1]
if not fn.lower().endswith(".pdf"):
    raise SystemExit("must be a PDF file")

# time.clock() was removed in Python 3.8; use perf_counter() where it exists
# and only fall back to clock() on interpreters that lack it.
timer = getattr(time, "perf_counter", None) or time.clock

fullname = os.path.abspath(fn)         # get the full path & name
t0 = timer()                           # save current time
doc = fitz.open(fullname)              # open PDF to save metadata
meta = doc.metadata
doc.close()

t1 = timer()                           # save current time again
subprocess.call(["fileoptimizer64", fullname])   # now invoke FileOptimizer
t2 = timer()                           # save current time again

cdir = os.path.split(fullname)[0]      # split dir from filename
fnout = tempfile.mkstemp(suffix = ".pdf", dir = cdir) # (descriptor, temp pdf name)
os.close(fnout[0])                     # only the name is needed; don't leak the descriptor
doc = fitz.open(fullname)              # open now optimized PDF
doc.setMetadata(meta)                  # restore old metadata
doc.save(fnout[1], garbage = 4)        # save temp PDF with it, a little sub opt
doc.close()                            # close it

os.remove(fn)                          # remove super optimized file
Exemplo n.º 47
0
from __future__ import print_function
import fitz
import argparse
#--------------------------------------------------------------------
# use argparse to handle invocation arguments
#--------------------------------------------------------------------
import os

parser = argparse.ArgumentParser(description="Enter CSV delimiter [;] and documment filename")
parser.add_argument('-d', help='CSV delimiter [;]', default = ';')
parser.add_argument('doc', help='document filename')
args = parser.parse_args()
delim = args.d               # requested CSV delimiter character
fname = args.doc             # input document filename

doc = fitz.open(fname)
toc = doc.getToC(simple = False)

# Split off the real extension instead of assuming it is three characters
# long, so e.g. "book.epub" becomes "book-toc.csv" rather than "book.-toc.csv".
base, ext = os.path.splitext(fname)
ext = ext.lower()
fname1 = base + "-toc.csv"

# One CSV record per ToC entry: level, title, page number, vertical position.
# The with-block closes the output file even if an entry cannot be written.
with open(fname1, "w") as outf:
    for t in toc:
        t4 = t[3]                          # detail dictionary of the entry
        # Only a PDF entry of kind 1 contributes the y coordinate of its
        # target point; everything else is written as "0".
        if ext == ".pdf" and t4["kind"] == 1:
            p4 = str(t4["to"].y)
        else:
            p4 = "0"
        rec = delim.join([str(t[0]), t[1].strip(), str(t[2]), p4])
        outf.write(rec + "\n")
Exemplo n.º 48
0
import fitz                           # import PyMuPDF
# Demo: fill a rectangle on a page with automatically wrapped text.
doc = fitz.open("some.pdf")           # or new: fitz.open(), followed by insertPage()
page = doc[n]                         # choose some page

# target area as (left, top, right, bottom)
left, top, right, bottom = 50, 100, 300, 400
rect = fitz.Rect(left, top, right, bottom)

# contains two forced line breaks, one tab and one overlong word
text = (
    "This text will only appear in the rectangle. "
    "Depending on width, new lines are generated as required.\n"
    "<- This forced line break will also appear.\t"
    "Now a very long word: abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.\n"
    "It will be broken into pieces."
)

textbox_options = {
    "fontsize": 12,                   # choose fontsize (float)
    "fontname": "Times-Roman",        # a PDF standard font
    "fontfile": None,                 # could be a file on your system
    "align": 0,                       # 0 = left, 1 = center, 2 = right
}
rc = page.insertTextbox(rect, text, **textbox_options)

# the return value is reported as the rectangle height left unused
print("unused rectangle height: %g" % rc)          # just demo (should display "44.2")

doc.saveIncr()   # update file. Save to new instead by doc.save("new.pdf",...)
Exemplo n.º 49
0
(2) Easily adapt the example to combine just 2 pages (like for a booklet) or
    make the output page dimension dependent on input, or whatever.

(3) This should run very fast: needed less than 25 sec on a Python 3.6 64bit,
    Windows 10, AMD 4.0 GHz for the 1'310 pages of the Adobe manual.
    Without save-options "garbage" and "deflate" this goes below 4 seconds, but
    results in a bigger file.
    
Dependencies
-------------
PyMuPDF 1.12.1 or later
'''
from __future__ import print_function
import fitz, sys
# Set-up for combining input pages onto A4 output pages, four per page.
infile = sys.argv[1]                   # input file name from the command line
src = fitz.open(infile)                # the document to read pages from
doc = fitz.open()                      # empty output PDF

width, height = fitz.PaperSize("a4")   # A4 portrait output page format
r = fitz.Rect(0, 0, width, height)     # the full output page

# define the 4 rectangles per page
# r starts at the origin, so scaling it by 0.5 gives its top-left quarter;
# the other quarters are that rectangle shifted by its own width / height.
r1 = r * 0.5                           # top left rect
r2 = r1 + (r1.width, 0, r1.width, 0)   # top right
r3 = r1 + (0, r1.height, 0, r1.height) # bottom left
r4 = fitz.Rect(r1.br, r.br)            # bottom right

# put them in a list
r_tab = [r1, r2, r3, r4]

# now copy input pages to output
Exemplo n.º 50
0
# Работаем с ПДФ файлами
"""
1. Выведем только первые 5 страниц"""
import fitz  # fitz is the import name of the PyMuPDF package
docu = fitz.open("SCAN.pdf")
# Page numbers of the first 5 pages. Capped at the real page count, because
# a shorter document does not have the page numbers range(5) would ask for.
spisok = list(range(min(5, docu.pageCount)))
docu.select(spisok)  # drops every page that is not in the list
docu.save("SCAN_NEW.pdf", garbage=3)
docu.close()
"""
import fitz

# Build a four-page test document whose third page carries no text.
stroka_1 = "ABCD"
stroka_2 = "EFGH"
stroka_3 = "IJKL"
new_docu = fitz.open()
new_docu.insertPage(text=stroka_1, fontsize=11)
new_docu.insertPage(text=stroka_2, fontsize=20)
new_docu.insertPage(text=None, fontsize=20)
new_docu.insertPage(text=stroka_3, fontsize=20)
new_docu.save("NewFile.pdf", garbage=3)
new_docu.close()

# Re-open the saved file and keep only the pages that contain text.
# The list is built in one go: removing items from a list while looping over
# it skips the element after each removal, so of two consecutive empty pages
# one would survive. The text is also read from the document that is being
# filtered, not from the one it was generated from.
new_docu_2 = fitz.open("NewFile.pdf")
spisok2 = [
    page_number
    for page_number in range(new_docu_2.pageCount)
    if new_docu_2.getPageText(page_number)
]
new_docu_2.select(spisok2)
new_docu_2.save("NewFileResult.pdf", garbage=3)
new_docu_2.close()

"""
Created on Thu Jul 16 12:50:03 2020

@author: Austin.Schrader
"""

import fitz
import os

# Images to put into the PDF, one page per image.
# NOTE(review): this is a raw string with doubled backslashes, so the path
# really contains "\\" between components — confirm that is intended.
imglist = [r'C:\\Users\\austin.schrader\\Desktop\\My_Desktop_Documents\\Python_Tools\\reading_emails_parse_attachment\\Attachments\\MI Payment from ACH Report.png']
# Output PDF; it is rewritten on every pass through the try block below.
pdf_path = r'C:\Users\austin.schrader\Desktop\My_Desktop_Documents\Python_Tools\reading_emails_parse_attachment\Attachments\MI Payment from ACH Report.pdf'

# For every ".png" found below the attachments folder, build the PDF from
# imglist (not from the file that was found) and then delete a file.
for root, dirs, files in os.walk(r'C:\\Users\\austin.schrader\\Desktop\\My_Desktop_Documents\\Python_Tools\\reading_emails_parse_attachment\\Attachments'):
    for f in files:
        if f.endswith(".png"):
            try:
                doc = fitz.open()                            # PDF with the pictures
                # NOTE(review): this inner loop reuses the name "f" and so
                # overwrites the file name of the outer loop.
                for f in imglist:
                    img = fitz.open(f) # open pic as document
                    rect = img[0].rect                       # pic dimension
                    pdfbytes = img.convertToPDF()            # make a PDF stream
                    img.close()                              # no longer needed
                    imgPDF = fitz.open("pdf", pdfbytes)      # open stream as PDF
                    page = doc.newPage(width = rect.width,   # new page with ...
                                       height = rect.height) # pic dimension
                    page.showPDFpage(rect, imgPDF, 0) 
                           # image fills the page
                doc.save(pdf_path)
                # NOTE(review): "f" is now the last entry of imglist, not the
                # file found by os.walk, so that is the path handed to
                # os.remove — confirm which file is meant to be deleted.
                os.remove(os.path.join(root, f))
            # NOTE(review): the bare except hides every failure, including
            # a missing image on later passes — consider reporting the error.
            except:
                pass
Exemplo n.º 52
0
import shutil

filename = '~/Sync/literature/村上春树_世界尽头与冷酷仙境_2007.pdf'
outname = 'world.pdf'
lineH = 15.0

# chap_start = 7
# chap_end = 256

# intermediate
locfile = 'haha.pdf'
xmpdf = 'xm.pdf'
xmcpdf = 'xmc.pdf'

# Work on a local copy of the book. Copy it with the standard library rather
# than a shell "cp" command line: that works on any platform, is not broken
# by spaces or shell characters in the path, and raises if the copy fails
# instead of carrying on without the file. "~" has to be expanded by hand
# because no shell is involved any more.
shutil.copyfile(os.path.expanduser(filename), locfile)
doc = fitz.open(locfile)

# doc.delete_pages(chap_end, -1)
# doc.delete_pages(0, chap_start - 2)

# repeat every page two times
# The page count is read once, before any copy is made. Each original page
# has moved to index 2 * k by the time it is reached, because every earlier
# page has already been doubled.
for origin_pagenum in range(len(doc)):
    pagenum = origin_pagenum * 2
    doc.fullcopy_page(pagenum, pagenum)

doc.save(xmpdf)
doc.close()

# crop each page
crop(["-b", "c", "-ap", "73", "-p", "5", "-v", xmpdf, "-o", xmcpdf])
Exemplo n.º 53
0
Usage:
------
extract_img4.py input.file

'''
from __future__ import print_function
import fitz

import hashlib
import sys, time

# exactly one command line argument is expected: the input file
assert len(sys.argv) == 2, 'Usage: %s <input file>' % sys.argv[0]
    
# "str is bytes" only holds on Python 2, where time.clock() is used;
# otherwise time.perf_counter() is used
t0 = time.clock() if str is bytes else time.perf_counter()
doc = fitz.open(sys.argv[1])           # the PDF
imgcount = 0                           # counts extracted images
hash_list = []                         # records images already extracted

# display some file info
print("file: %s, pages: %i" % (sys.argv[1], len(doc)))

for page in doc:                  # cycle through the document's pages
    js = page.getText("dict")     # get a page's content in dict format
    blocks = js["blocks"]         # we are interested in the blocks
    j = 0                         # counts images per page

    for b in blocks:
        if b["type"] != 1:        # not an image block
            continue 
        # base name is "p<page number>-<image counter>." — it ends with the dot
        fname = "p%i-%i." % (page.number, j)     # file names look like so
Exemplo n.º 54
0
        )[SOURCE_EXCEL_START_ROW]:
            if row[0].value.strip() != '':
                name_list.append(row[config_module.get_config_obj()
                                     [SOURCE_EXCEL_FILE_NAME_COLUMN]].value)
        row_index = row_index + 1
    return name_list


def _cfg(key):
    """Look up one setting in the current configuration object."""
    return config_module.get_config_obj()[key]


print('正在初始化程序数据...')
global_data_module.init()

# read the names from the Excel sheet
print('正在解析配置文件并读取Excel...')
file_names = read_asin_data(_cfg(SOURCE_EXCEL))

# open the PDF
print('正在解码PDF...')
pdf_doc = fitz.open(_cfg(SOURCE_PDF))
print(''.join(('PDF页数:', str(pdf_doc.pageCount), ', EXCEL 记录数:',
               str(len(file_names)))))

# every PDF page needs exactly one name from the sheet
if len(file_names) != pdf_doc.pageCount:
    exit_module.tip_and_wait_then_exit('Excel的记录数量与PDF页数不匹配,无法继续操作')

# same zoom factor in both directions
mat = fitz.Matrix(_cfg(IMAGE_SCALE), _cfg(IMAGE_SCALE))

# create the output folder on first use
if not os.path.exists(_cfg(OUTPUT_DIR)):
    os.makedirs(_cfg(OUTPUT_DIR))

# render page k to "<output dir>/<k-th name>.png"
for page_index in range(pdf_doc.pageCount):
    pdf_page = pdf_doc[page_index]
    progress = ''.join(('开始转换写出第 ', str(page_index + 1), ' / ',
                        str(len(file_names)), ' 页图片数据'))
    print(progress)
    rendered = pdf_page.getPixmap(matrix=mat)
    target = _cfg(OUTPUT_DIR) + os.sep + file_names[page_index] + '.png'
    rendered.writePNG(target)
Exemplo n.º 55
0
For XPS and EPUB input, internal links however **are** of type "LINK_NAMED".
Base library MuPDF does not resolve them to page numbers.

So anyone expert enough to know the internal structure of these
document types can further interpret and resolve these link types.

Dependencies
--------------
PyMuPDF v1.13.3
"""
import fitz
import sys
# refuse to run on a binding older than 1.13.3
installed = [int(part) for part in fitz.VersionBind.split(".")]
if installed < [1, 13, 3]:
    raise SystemExit("insufficient PyMuPDF version")

fn = sys.argv[1]
doc = fitz.open(fn)
if doc.isPDF:                     # nothing to convert
    raise SystemExit("document is PDF already")

print("Converting '%s' to '%s.pdf'" % (fn, fn))
b = doc.convertToPDF()            # the converted document as PDF data
pdf = fitz.open("pdf", b)         # ... opened as a PDF document

# carry the table of contents of the input over to the output
toc = doc.getToC()
pdf.setToC(toc)

# take the metadata of the input and fill in the two fields that are empty
meta = doc.metadata
fallbacks = (
    ("producer", "PyMuPDF v" + fitz.VersionBind),
    ("creator", "PyMuPDF PDF converter"),
)
for key, fallback in fallbacks:
    if not meta[key]:
        meta[key] = fallback
Exemplo n.º 56
0
from __future__ import print_function
import fitz
import argparse
#--------------------------------------------------------------------
# use argparse to handle invocation arguments
#--------------------------------------------------------------------
import os

parser = argparse.ArgumentParser(
    description="Enter CSV delimiter [;] and documment filename")
parser.add_argument('-d', help='CSV delimiter [;]', default=';')
parser.add_argument('doc', help='document filename')
args = parser.parse_args()
delim = args.d  # requested CSV delimiter character
fname = args.doc  # input document filename

doc = fitz.open(fname)
toc = doc.getToC(simple=False)

# Use the real extension rather than "the last three characters", so names
# with longer or missing extensions still give a sensible output file name.
base, ext = os.path.splitext(fname)
ext = ext.lower()
fname1 = base + "-toc.csv"

# One CSV record per ToC entry: level, title, page number, vertical position.
# The with-block closes the output file even if an entry cannot be written.
with open(fname1, "w") as outf:
    for t in toc:
        t4 = t[3]  # detail dictionary of the entry
        # Only a PDF entry of kind 1 contributes the y coordinate of its
        # target point; for everything else the column stays empty.
        if ext == ".pdf" and t4["kind"] == 1:
            p4 = str(t4["to"].y)
        else:
            p4 = ""
        rec = delim.join([str(t[0]), t[1].strip(), str(t[2]), p4])
        outf.write(rec + "\n")
Exemplo n.º 57
0
import calendar
import sys
# Validate the command line with real checks: assert statements are removed
# when Python runs with -O, which would let bad input through.
if len(sys.argv) != 2:
    raise SystemExit("need start year as the one and only parameter")
startyear = sys.argv[1]

if not startyear.isdigit():
    raise SystemExit("year must be positive numeric")
startyear = int(startyear)

if startyear <= 0:
    raise SystemExit("year must be positive numeric")

# We use a nicer mono-spaced font than the PDF builtin 'Courier'.
# If you do not know one, set ffile to None and fname to 'Courier'
ffile = "c:/windows/fonts/dejavusansmono.ttf"
fname = "F0"

doc = fitz.open()
cal = calendar.LocaleTextCalendar(locale = "es") # use your locale
#cal = calendar.TextCalendar()                   # or stick with English

w, h = fitz.PaperSize("a4-l")          # get sizes for A4 landscape paper

# one page per year: the start year and the year after it
for year in (startyear, startyear + 1):
    txt = cal.formatyear(year, m = 4)  # the year as text, 4 months per row
    doc.insertPage(-1, txt, fontsize = 12, fontname = fname, fontfile = ffile,
                   width = w, height = h)

txt = cal.formatyear(startyear + 2, m = 4)
doc.insertPage(-1, txt, fontsize = 12, fontname = fname, fontfile = ffile,
Exemplo n.º 58
0
import os
import fitz

from mamba import description, it, before
from fitzutils import ToCEntry
from pdftocio.tocio import read_toc, write_toc

# directory that contains this spec file; the fixtures are located relative to it
dirpath = os.path.dirname(os.path.abspath(__file__))

# paths of the two PDF fixtures opened by the spec below
level2 = os.path.join(dirpath, "files/level2.pdf")
hastoc = os.path.join(dirpath, "files/hastoc.pdf")

with description("read_toc") as self:
    with before.all:
        self.doc = fitz.open(level2)
        self.reference = fitz.open(hastoc)
        self.expect = [
            ToCEntry(level=1, title='Section One', pagenum=1),
            ToCEntry(level=1, title='Section Two', pagenum=1),
            ToCEntry(level=2, title='Subsection Two.One', pagenum=2),
            ToCEntry(level=1,
                     title='Section Three, with looong loooong looong title',
                     pagenum=3),
            ToCEntry(
                level=2,
                title='Subsection Three.One, '
                'with even loooooooooooonger title, and probably even more',
                pagenum=3),
            ToCEntry(level=2, title='Subsection Three.Two', pagenum=4),
            ToCEntry(level=2, title='Subsection Three.Three', pagenum=5),
            ToCEntry(level=1, title='The End', pagenum=5)
Exemplo n.º 59
0
from __future__ import print_function
import sys
import fitz
#------------------------------------------------------------------------------
# Example program
# License: GNU GPL V3
# Extracts an embedded file from an existing PDF
# Command line:
# python embedded-export.py input.pdf name export.file
#------------------------------------------------------------------------------
if len(sys.argv) < 4:
    # say what is expected instead of failing with an IndexError
    raise SystemExit("usage: python embedded-export.py input.pdf name export.file")

pdffn = sys.argv[1]               # PDF file name
name  = sys.argv[2]               # embedded file identifier
expfn = sys.argv[3]               # filename of exported file

doc = fitz.open(pdffn)            # open PDF

# Extract the file content first. Will get exception on any error, and in
# that case no output file has been created (or truncated) yet.
content = doc.embeddedFileGet(name)

# to be on the safe side always open binary; the with-block closes the file
with open(expfn, "wb") as outfile:
    outfile.write(content)
Exemplo n.º 60
0
    def __init__(self, url, config_dir, background_color):
        """
        Open the document at ``url`` and initialise the viewer state.

        Besides setting the attributes below, this writes the document's
        table of contents to a text file under
        ``<config_dir>/pdf-viewer/table/``.

        :param url: passed to ``fitz.open``; also hashed to name the table file
        :param config_dir: base directory for the table-of-contents file
        :param background_color: stored unchanged in ``self.background_color``
        """
        super(PdfViewerWidget, self).__init__()

        self.url = url
        self.config_dir = config_dir
        self.background_color = background_color
        self.installEventFilter(self)
        self.setMouseTracking(True)

        # Load document first.
        self.document = fitz.open(url)

        # Get document's page information.
        # The size of the first page's pixmap is taken as the page size.
        self.first_pixmap = self.document.getPagePixmap(0)
        self.page_width = self.first_pixmap.width
        self.page_height = self.first_pixmap.height
        self.page_total_number = self.document.pageCount

        # Init scale and scale mode.
        self.scale = 1.0
        self.read_mode = "fit_to_width"

        # Inverted mode.
        self.inverted_mode = False

        # mark link
        self.is_mark_link = False
        self.mark_link_annot_cache_dict = {}

        #jump link
        self.jump_link_key_cache_dict = {}
        self.jump_link_annot_cache_dict = {}

        #global search text
        self.is_mark_search = False
        self.search_text_offset_list = []
        self.search_text_annot_cache_dict = {}

        # select text
        self.is_select_mode = False
        self.start_char_rect_index = None
        self.start_char_page_index = None
        self.last_char_rect_index = None
        self.last_char_page_index = None
        self.select_area_annot_cache_dict = {}
        # one entry per page index, all None to begin with
        self.char_dict = {k: None for k in range(self.page_total_number)}

        # Init scroll attributes.
        self.scroll_step = 20
        self.scroll_offset = 0
        self.mouse_scroll_offset = 20

        # Padding between pages.
        self.page_padding = 10

        # Init font.
        self.page_annotate_height = 22
        self.page_annotate_padding_right = 10
        self.page_annotate_padding_bottom = 10
        self.page_annotate_light_color = QColor("#333333")
        self.page_annotate_dark_color = QColor("#999999")
        self.font = QFont()
        self.font.setPointSize(12)

        # Page cache.
        self.page_cache_pixmap_dict = {}
        self.page_cache_scale = self.scale
        self.page_cache_trans = None
        self.page_cache_context_delay = 1000  # NOTE(review): unit not visible here, presumably milliseconds

        self.last_action_time = 0

        self.is_page_just_changed = False

        self.remember_offset = None

        # Save table in file for search framework, such as snails, search table to navigate.
        # One line per ToC entry: page number, then the result of
        # repeat_to_length(" ", level * 4) as indentation, then the title.
        table_info = ""
        for info in self.document.getToC():
            indentation_num = info[0]
            title = info[1]
            page = info[2]

            table_info += str(page) + self.repeat_to_length(
                " ", indentation_num * 4) + title + "\n"

        # The file is named after the MD5 hex digest of the url.
        table_file_hash = hashlib.md5(self.url.encode()).hexdigest()
        self.table_file_path = os.path.join(config_dir, "pdf-viewer", "table",
                                            table_file_hash)
        # touch() is defined elsewhere; presumably it makes sure the file
        # (and its folders) exist before it is opened for writing
        touch(self.table_file_path)
        with open(self.table_file_path, "w") as f:
            f.write(table_info)