Python Document示例

编程语言: Python

命名空间/包名称: pdflib

类/类型: Document

hotexamples.com的示例: 14

Python Document - 共找到14个示例。以下是从开源项目中提取的、评价最高的 pdflib.Document 真实 Python 代码示例。您可以为示例评分，帮助我们提高示例质量。

常用方法

显示/隐藏

Document(14)

extract_images(2)

示例#1

显示文件

文件： test_pdflib.py 项目： wayne9qiu/pdflib

    def test_right_to_left(self):
        """Compare text extracted from a right-to-left PDF with a reference file.

        Each page's lines are joined with a space-newline separator and
        stripped; the per-page strings are concatenated without a separator.
        """
        doc = Document("tests/resources/Fairy-Circles-Truly-a-Fairy-Tale-R-FKB-Kids-Stories_FA.pdf")
        text = "".join(' \n'.join(page.lines).strip() for page in doc)

        # Decode the reference explicitly so the comparison does not depend on
        # the platform's default locale encoding.
        # NOTE(review): assumes the fixture is stored as UTF-8 — confirm.
        with open(
            "tests/resources/Fairy-Circles-Truly-a-Fairy-Tale-R-FKB-Kids-Stories_FA.txt",
            "r",
            encoding="utf-8",
        ) as f:
            correct = f.read()
        assert correct == text

示例#2

显示文件

 def ingest(self, file_path, entity):
     """Open ``file_path`` as a PDF and run the extraction steps on it.

     Any exception raised while converting the path to bytes or building
     the ``Document`` is re-raised as ``ProcessingException``, chained to
     the original error.
     """
     try:
         document = Document(bytes(file_path))
     except Exception as error:
         message = "Could not extract PDF file: %r" % error
         raise ProcessingException(message) from error  # noqa
     else:
         self.extract_metadata(document, entity)
         self.extract_xmp_metadata(document, entity)
         self.pdf_extract(entity, document)

示例#3

显示文件

文件： pdf.py 项目： x0rzkov/aleph

 def pdf_alternative_extract(self, entity, pdf_path):
     """Store the PDF, record the stored value on the entity, then extract.

     The value returned by ``self.manager.store`` is set as ``pdfHash`` on
     ``entity`` before the document is opened and passed to ``pdf_extract``.
     """
     stored_hash = self.manager.store(pdf_path)
     entity.set('pdfHash', stored_hash)
     self.pdf_extract(entity, Document(bytes(pdf_path)))

示例#4

显示文件

文件： pages_with_text.py 项目： code-for-magdeburg/pdf-scripts

# For each input PDF, print the 1-based numbers of the pages whose text
# length exceeds --threshold (or, with --absence, the pages where it does not).
parser = argparse.ArgumentParser(
    description="checks for presence or absence of text on images")
parser.add_argument("input_files", type=str, nargs="+", help="path to a PDF")
parser.add_argument(
    "--threshold",
    type=int,
    default=0,
    help="maximum number of chars to consider a page empty",
)
parser.add_argument("--absence",
                    action="store_true",
                    help="returns pages without text")

args = parser.parse_args()

for input_file in args.input_files:
    doc = Document(input_file)
    pages_with_text = []
    num_pages = 0

    for page_number, page in enumerate(doc, start=1):  # 1-based for PDFs
        num_pages = page_number
        num_chars = sum(map(len, page.lines))
        if num_chars > args.threshold:
            pages_with_text.append(page_number)

    if args.absence:
        # Walk the page range in order rather than materialising a set
        # difference, whose iteration order is not guaranteed.
        found = set(pages_with_text)
        output = [n for n in range(1, num_pages + 1) if n not in found]
    else:
        output = pages_with_text

    print(" ".join(map(str, output)))

示例#5

显示文件

文件： test_pdflib.py 项目： wayne9qiu/pdflib

 def test_empty_pdf(self):
     """Opening the ``empty.pdf`` fixture is expected to raise IOError."""
     empty_pdf = "tests/resources/empty.pdf"
     with pytest.raises(IOError):
         Document(empty_pdf)

示例#6

显示文件

文件： test_pdflib.py 项目： wayne9qiu/pdflib

 def test_non_pdf_file(self):
     """Opening a ``.txt`` fixture as a PDF is expected to raise IOError."""
     not_a_pdf = "tests/resources/not-pdf.txt"
     with pytest.raises(IOError):
         Document(not_a_pdf)

示例#7

显示文件

文件： test_pdflib.py 项目： wayne9qiu/pdflib

 def test_directory_path(self):
     """Passing a directory instead of a file is expected to raise IOError."""
     # Use the fixtures directory ("tests/", as the other tests in this file
     # do) so this exercises the directory case rather than a missing path.
     with pytest.raises(IOError):
         Document("tests/resources/")

示例#8

显示文件

文件： test_pdflib.py 项目： wayne9qiu/pdflib

 def test_non_existent_file(self):
     """Opening the ``not-exists.pdf`` path is expected to raise IOError."""
     missing_pdf = "tests/resources/not-exists.pdf"
     with pytest.raises(IOError):
         Document(missing_pdf)

示例#9

显示文件

文件： test_pdflib.py 项目： wayne9qiu/pdflib

 def test_extract_metadata(self):
     """Both ``metadata`` and ``xmp_metadata`` must be truthy for FAC.pdf."""
     fixture = "tests/resources/FAC.pdf"
     pdf = Document(fixture)
     # Checked one at a time, in this order, so a failure names the attribute.
     assert pdf.metadata
     assert pdf.xmp_metadata

示例#10

显示文件

文件： test_pdflib.py 项目： wayne9qiu/pdflib

 def test_extract_text(self):
     """The text extracted from prop.pdf must contain the word "Milestones".

     Each page's lines are joined with a space-newline separator and
     stripped; the per-page strings are concatenated without a separator.
     """
     pdf = Document("tests/resources/prop.pdf")
     page_texts = (' \n'.join(page.lines).strip() for page in pdf)
     extracted = "".join(page_texts)
     assert "Milestones" in extracted

示例#11

显示文件

文件： test_pdflib.py 项目： wayne9qiu/pdflib

 def test_bytes_paths(self):
     """Open a PDF and extract its images using ``bytes`` paths.

     Expects the output directory to exist afterwards and to hold four PNGs.
     """
     self._clean_images()
     pdf = Document(b"tests/resources/FAC.pdf")
     pdf.extract_images(path=b"tests/images", prefix="img")
     assert os.path.exists("tests/images")
     png_pattern = os.path.join("tests/images", "*.png")
     extracted_pngs = glob.glob(png_pattern)
     assert len(extracted_pngs) == 4

示例#12

显示文件

文件： test_pdflib.py 项目： wayne9qiu/pdflib

 def test_extract_images(self, path, no_imgs):
     """Extract images from ``path`` and expect ``no_imgs`` PNG files.

     Also checks that the ``tests/images`` output directory exists.
     """
     self._clean_images()
     pdf = Document(path)
     pdf.extract_images(path="tests/images", prefix="img")
     assert os.path.exists("tests/images")
     png_pattern = os.path.join("tests/images", "*.png")
     extracted_pngs = glob.glob(png_pattern)
     assert len(extracted_pngs) == no_imgs

示例#13

显示文件

文件： pdf.py 项目： slad99/ingestors

 def pdf_alternative_extract(self, pdf_path):
     """Emit the PDF alternative for ``pdf_path``, then extract from it.

     The path is UTF-8 encoded before being handed to ``Document``.
     """
     self.result.emit_pdf_alternative(pdf_path)
     encoded_path = pdf_path.encode('utf-8')
     self.pdf_extract(Document(encoded_path))

示例#14

显示文件

文件： pdf.py 项目： bobquest33/ingestors

 def pdf_alternative_extract(self, pdf_path):
     """Open the PDF at ``pdf_path`` and pass it to ``pdf_extract``.

     The path is UTF-8 encoded before being handed to ``Document``.
     """
     encoded_path = pdf_path.encode('utf-8')
     document = Document(encoded_path)
     self.pdf_extract(document)