Пример #1
0
    def import_doc(file_uri, config, docsearch, current_doc=None):
        """
        Import every readable PDF found under the given location.

        Each child of `file_uri` whose name ends with ".pdf" (case
        insensitive) and that Poppler manages to open is imported as a new
        PdfDoc created from `config.workdir`; all of its pages are then
        passed to `docsearch.index_page()`.

        Arguments:
            file_uri -- location to scan, parsed with Gio.File.parse_name()
            config -- configuration object; provides `workdir` and is
                forwarded to PdfDoc.import_pdf()
            docsearch -- object whose index_page() receives each page
            current_doc -- not used by this importer

        Returns:
            A tuple (doc, page): the last document imported and its first
            page.

        Raises:
            AssertionError -- if no PDF could be imported
        """
        logger.info("Importing PDF from '%s'", file_uri)
        parent = Gio.File.parse_name(file_uri)
        doc = None

        for child in MultiplePdfImporter.__get_all_children(parent):
            if not child.get_basename().lower().endswith(".pdf"):
                continue
            try:
                # make sure we can import it
                Poppler.Document.new_from_file(child.get_uri(), password=None)
            except Exception:
                # Best effort: one unreadable file must not abort the whole
                # import, but leave a trace of why it was skipped.
                logger.warning("Unable to open '%s' as a PDF. Skipped",
                               child.get_uri(), exc_info=True)
                continue
            doc = PdfDoc(config.workdir)
            doc.import_pdf(config, child.get_uri())
            for page in doc.pages:
                docsearch.index_page(page)

        if doc is None:
            # Raised explicitly (instead of using `assert`) so the check is
            # not stripped when Python runs with -O; the exception type is
            # the one the former assert produced.
            raise AssertionError(
                "No importable PDF found in '%s'" % file_uri)
        return (doc, doc.pages[0])
Пример #2
0
    def import_doc(file_uri, docsearch, current_doc=None):
        """
        Import every readable PDF found under the given location.

        Each child of `file_uri` whose name ends with ".pdf" (case
        insensitive) is imported as a new PdfDoc created from
        `docsearch.rootdir`, unless its file hash is already known to
        `docsearch` or Poppler cannot open it.

        Arguments:
            file_uri -- location to scan, parsed with Gio.File.parse_name()
            docsearch -- provides `rootdir` and `is_hash_in_index()`
            current_doc -- not used by this importer

        Returns:
            (docs, None, True) where `docs` is the list of imported
            documents, or (None, None, False) if nothing was imported.
        """
        logger.info("Importing PDF from '%s'", file_uri)
        parent = Gio.File.parse_name(file_uri)
        docs = []

        for child in MultiplePdfImporter.__get_all_children(parent):
            if not child.get_basename().lower().endswith(".pdf"):
                continue
            if docsearch.is_hash_in_index(PdfDoc.hash_file(child.get_path())):
                logger.info("Document %s already found in the index. Skipped",
                            child.get_path())
                continue
            try:
                # make sure we can import it
                Poppler.Document.new_from_file(child.get_uri(),
                                               password=None)
            except Exception:
                # Best effort: one unreadable file must not abort the whole
                # import, but leave a trace of why it was skipped.
                logger.warning("Unable to open '%s' as a PDF. Skipped",
                               child.get_uri(), exc_info=True)
                continue
            doc = PdfDoc(docsearch.rootdir)
            doc.import_pdf(child.get_uri())
            docs.append(doc)

        # An empty list means no file passed all the checks above.
        if not docs:
            return (None, None, False)
        return (docs, None, True)
Пример #3
0
    def import_doc(self, file_uri, config, docsearch, current_doc=None):
        """
        Import every readable PDF found under the given location.

        Each child of `file_uri` whose name ends with ".pdf" (case
        insensitive) and that Poppler manages to open is imported as a new
        PdfDoc created from `config.workdir`; all of its pages are then
        passed to `docsearch.index_page()`.

        Arguments:
            file_uri -- location to scan, parsed with Gio.File.parse_name()
            config -- configuration object; provides `workdir` and is
                forwarded to PdfDoc.import_pdf()
            docsearch -- object whose index_page() receives each page
            current_doc -- not used by this importer

        Returns:
            A tuple (doc, page): the last document imported and its first
            page.

        Raises:
            AssertionError -- if no PDF could be imported
        """
        print("Importing doc '%s'" % (file_uri))
        parent = Gio.File.parse_name(file_uri)
        doc = None

        # Number of documents imported so far; used as a suffix below.
        idx = 0

        for child in self.__get_all_children(parent):
            if not child.get_basename().lower().endswith(".pdf"):
                continue
            try:
                # make sure we can import it
                Poppler.Document.new_from_file(child.get_uri(),
                                               password=None)
            except Exception:
                # Best effort: files Poppler cannot open are skipped.
                continue
            doc = PdfDoc(config.workdir)
            # Suffix path and id with the running counter
            # (presumably to keep documents created in the same run
            # distinct -- TODO confirm).
            doc.path += ("_%02d" % idx)
            doc.docid += ("_%02d" % idx)
            doc.import_pdf(config, child.get_uri())
            for page in doc.pages:
                docsearch.index_page(page)
            idx += 1

        if doc is None:
            # Identity test instead of `!= None`, and an explicit raise
            # instead of `assert` so the check is not stripped when Python
            # runs with -O; the exception type is unchanged.
            raise AssertionError(
                "No importable PDF found in '%s'" % file_uri)
        return (doc, doc.pages[0])
Пример #4
0
    def import_doc(file_uri, config, docsearch, current_doc=None):
        """
        Import every readable PDF found under the given location.

        Each child of `file_uri` whose name ends with ".pdf" (case
        insensitive) and that Poppler manages to open is imported as a new
        PdfDoc created from `config.workdir`; all of its pages are then
        passed to `docsearch.index_page()`.

        Arguments:
            file_uri -- location to scan, parsed with Gio.File.parse_name()
            config -- configuration object; provides `workdir` and is
                forwarded to PdfDoc.import_pdf()
            docsearch -- object whose index_page() receives each page
            current_doc -- not used by this importer

        Returns:
            A tuple (doc, page): the last document imported and its first
            page.

        Raises:
            AssertionError -- if no PDF could be imported
        """
        logger.info("Importing PDF from '%s'", file_uri)
        parent = Gio.File.parse_name(file_uri)
        doc = None

        for child in MultiplePdfImporter.__get_all_children(parent):
            if not child.get_basename().lower().endswith(".pdf"):
                continue
            try:
                # make sure we can import it
                Poppler.Document.new_from_file(child.get_uri(),
                                               password=None)
            except Exception:
                # Best effort: one unreadable file must not abort the whole
                # import, but leave a trace of why it was skipped.
                logger.warning("Unable to open '%s' as a PDF. Skipped",
                               child.get_uri(), exc_info=True)
                continue
            doc = PdfDoc(config.workdir)
            doc.import_pdf(config, child.get_uri())
            for page in doc.pages:
                docsearch.index_page(page)

        if doc is None:
            # Raised explicitly (instead of using `assert`) so the check is
            # not stripped when Python runs with -O; the exception type is
            # the one the former assert produced.
            raise AssertionError(
                "No importable PDF found in '%s'" % file_uri)
        return (doc, doc.pages[0])
Пример #5
0
    def import_doc(self, file_uri, config, docsearch, current_doc=None):
        """
        Import every readable PDF found under the given location.

        Each child of `file_uri` whose name ends with ".pdf" (case
        insensitive) and that Poppler manages to open is imported as a new
        PdfDoc created from `config.workdir`; all of its pages are then
        passed to `docsearch.index_page()`.

        Arguments:
            file_uri -- location to scan, parsed with Gio.File.parse_name()
            config -- configuration object; provides `workdir` and is
                forwarded to PdfDoc.import_pdf()
            docsearch -- object whose index_page() receives each page
            current_doc -- not used by this importer

        Returns:
            A tuple (doc, page): the last document imported and its first
            page.

        Raises:
            AssertionError -- if no PDF could be imported
        """
        print("Importing doc '%s'" % (file_uri))
        parent = Gio.File.parse_name(file_uri)
        doc = None

        # Number of documents imported so far; used as a suffix below.
        idx = 0

        for child in self.__get_all_children(parent):
            if not child.get_basename().lower().endswith(".pdf"):
                continue
            try:
                # make sure we can import it
                Poppler.Document.new_from_file(child.get_uri(), password=None)
            except Exception:
                # Best effort: files Poppler cannot open are skipped.
                continue
            doc = PdfDoc(config.workdir)
            # Suffix path and id with the running counter
            # (presumably to keep documents created in the same run
            # distinct -- TODO confirm).
            doc.path += ("_%02d" % idx)
            doc.docid += ("_%02d" % idx)
            doc.import_pdf(config, child.get_uri())
            for page in doc.pages:
                docsearch.index_page(page)
            idx += 1

        if doc is None:
            # Identity test instead of `!= None`, and an explicit raise
            # instead of `assert` so the check is not stripped when Python
            # runs with -O; the exception type is unchanged.
            raise AssertionError(
                "No importable PDF found in '%s'" % file_uri)
        return (doc, doc.pages[0])
Пример #6
0
 def import_doc(file_uri, docsearch, current_doc=None):
     """
     Create a new PdfDoc from `docsearch.rootdir` and import the PDF
     located at `file_uri` into it.

     Returns a 3-tuple: a one-element list holding the new document,
     None, and True.
     """
     new_doc = PdfDoc(docsearch.rootdir)
     message = "Importing doc '%s' ..." % file_uri
     logger.info(message)
     new_doc.import_pdf(file_uri)
     imported = [new_doc]
     return (imported, None, True)
Пример #7
0
 def import_doc(file_uri, config, docsearch, current_doc=None):
     """
     Create a new PdfDoc from the 'workdir' setting of `config` and import
     the PDF located at `file_uri` into it.

     Returns a 3-tuple: a one-element list holding the new document,
     None, and True.
     """
     workdir = config.settings['workdir'].value
     new_doc = PdfDoc(workdir)
     message = "Importing doc '%s' ..." % file_uri
     logger.info(message)
     new_doc.import_pdf(config, file_uri)
     imported = [new_doc]
     return (imported, None, True)
Пример #8
0
 def import_doc(self, file_uri, config, docsearch, current_doc=None):
     """
     Import the PDF located at `file_uri` into a new PdfDoc created from
     `config.workdir`, then pass each of its pages to
     `docsearch.index_page()`.

     Returns a tuple (document, first page of the document).
     """
     new_doc = PdfDoc(config.workdir)
     print("Importing doc '%s' ..." % file_uri)
     new_doc.import_pdf(config, file_uri)
     for pdf_page in new_doc.pages:
         progress = "Indexing page %s:p%d ..." % (file_uri, pdf_page.page_nb)
         print(progress)
         docsearch.index_page(pdf_page)
     first_page = new_doc.pages[0]
     return (new_doc, first_page)
Пример #9
0
 def import_doc(self, file_uri, config, docsearch, current_doc=None):
     """
     Import the PDF located at `file_uri` into a new PdfDoc created from
     `config.workdir`, then pass each of its pages to
     `docsearch.index_page()`.

     Returns a tuple (document, first page of the document).
     """
     new_doc = PdfDoc(config.workdir)
     print("Importing doc '%s' ..." % file_uri)
     new_doc.import_pdf(config, file_uri)
     for pdf_page in new_doc.pages:
         progress = "Indexing page %s:p%d ..." % (file_uri, pdf_page.page_nb)
         print(progress)
         docsearch.index_page(pdf_page)
     first_page = new_doc.pages[0]
     return (new_doc, first_page)
Пример #10
0
 def import_doc(file_uri, config, docsearch, current_doc=None):
     """
     Import the PDF located at `file_uri` into a new PdfDoc created from
     `config.workdir`, then pass each of its pages to
     `docsearch.index_page()`.

     Returns a tuple (document, first page of the document).
     """
     new_doc = PdfDoc(config.workdir)
     start_msg = "Importing doc '%s' ..." % file_uri
     logger.info(start_msg)
     new_doc.import_pdf(config, file_uri)
     for pdf_page in new_doc.pages:
         page_msg = "Indexing page %s:p%d ..." % (file_uri, pdf_page.page_nb)
         logger.info(page_msg)
         docsearch.index_page(pdf_page)
     first_page = new_doc.pages[0]
     return (new_doc, first_page)
Пример #11
0
 def import_doc(file_uri, config, docsearch, current_doc=None):
     """
     Import the PDF located at `file_uri` into a new PdfDoc created from
     `config.workdir`, then pass each of its pages to
     `docsearch.index_page()`.

     Returns a tuple (document, first page of the document).
     """
     new_doc = PdfDoc(config.workdir)
     start_msg = "Importing doc '%s' ..." % file_uri
     logger.info(start_msg)
     new_doc.import_pdf(config, file_uri)
     for pdf_page in new_doc.pages:
         page_msg = "Indexing page %s:p%d ..." % (file_uri, pdf_page.page_nb)
         logger.info(page_msg)
         docsearch.index_page(pdf_page)
     first_page = new_doc.pages[0]
     return (new_doc, first_page)
Пример #12
0
    def split_pages(self, pages):
        """
        Split the document at these pages.

        Every page whose `page_nb` is listed in `pages` becomes the first
        page of a new PdfDoc, created with the parent directory of
        `self.path` as its base directory. The pages that follow go to that
        new document until the next split point. Pages located before the
        first split point are written to a temporary file that replaces
        this document's PDF once the split is done.

        Arguments:
            pages -- collection of page numbers to split at. It is modified
                in place: 0 is removed, since splitting at the first page
                would leave an empty document.

        Returns:
            The list of newly created documents, or None if there is no
            page left to split at.
        """

        # You can't leave empty documents
        if 0 in pages:
            pages.remove(0)
        if not pages:
            return

        logger.info("Splitting %s at %s", self.docid, pages)

        # Poppler can't work with individual pages, thus we use pdfrw.
        from paperwork.backend.pdf.doc import PDF_FILENAME, PdfDoc
        # NOTE(review): SinglePdfImporter is not referenced in this method;
        # confirm whether the import is only kept for its side effects.
        from paperwork.backend.docimport import SinglePdfImporter
        import pdfrw

        # Work on a copy of the page list
        # (presumably because move_index() changes self.pages -- TODO confirm)
        doc_pages = self.pages[:]
        pdir = os.path.abspath(os.path.join(self.path, os.path.pardir))
        new_docs = []

        # The remaining pages of this document are first written to a
        # '.new' file next to the original, which is still being read.
        pdf_r_name = os.path.join(self.path, PDF_FILENAME)
        pdf_a_name = os.path.join(self.path, PDF_FILENAME + '.new')
        pdf_r = pdfrw.PdfReader(pdf_r_name)
        dest = pdfrw.PdfWriter()
        dest_path = pdf_a_name

        # Stays 0 (falsy) until the first split point has been reached, so
        # pages located before it are not moved.
        offset = 0
        for pdf_page, page in zip(pdf_r.pages, doc_pages):
            if page.page_nb in pages:
                # Split point: flush the pages collected so far, then start
                # collecting into a brand new document.
                dest.write(dest_path)

                new_doc = PdfDoc(pdir, label_store=self.label_store)
                os.mkdir(new_doc.path)
                new_doc.labels = self.labels.copy()
                # NOTE(review): pdf_b is never read afterwards.
                dest = pdf_b = pdfrw.PdfWriter()
                dest_path = os.path.join(new_doc.path, PDF_FILENAME)
                new_docs.append(new_doc)
                offset = page.page_nb
            dest.addpage(pdf_page)
            if offset:
                # NOTE(review): offset starts from the page number in the
                # original document and is incremented before each call;
                # confirm this is the index move_index() expects.
                offset += 1
                page.move_index(new_doc, offset)

        # Flush the last document being built.
        dest.write(dest_path)
        self.drop_cache()

        # Replace the original PDF with the file holding the pages that
        # stay in this document.
        os.rename(pdf_a_name, pdf_r_name)
        return new_docs
Пример #13
0
 def import_doc(self, file_uri, config, docsearch, current_doc=None):
     """
     Import the PDF located at `file_uri` into a new PdfDoc created from
     `config.workdir`, then pass each of its pages to
     `docsearch.index_page()`.

     Returns a tuple (document, first page of the document).
     """
     new_doc = PdfDoc(config.workdir)
     new_doc.import_pdf(config, file_uri)
     for pdf_page in new_doc.pages:
         docsearch.index_page(pdf_page)
     first_page = new_doc.pages[0]
     return (new_doc, first_page)
Пример #14
0
 def import_doc(self, file_uri, config, docsearch, current_doc=None):
     """
     Import the PDF located at `file_uri` into a new PdfDoc created from
     `config.workdir`, then pass each of its pages to
     `docsearch.index_page()`.

     Returns a tuple (document, first page of the document).
     """
     new_doc = PdfDoc(config.workdir)
     new_doc.import_pdf(config, file_uri)
     for pdf_page in new_doc.pages:
         docsearch.index_page(pdf_page)
     first_page = new_doc.pages[0]
     return (new_doc, first_page)