示例#1
0
    def test_right_to_left(self):
        """Compare text extracted from the right-to-left fixture with its transcript.

        Each page's lines are joined with a space followed by a newline and
        then stripped; the per-page strings are concatenated without any
        separator and must equal the contents of the reference ``.txt`` file.
        """
        doc = Document("tests/resources/Fairy-Circles-Truly-a-Fairy-Tale-R-FKB-Kids-Stories_FA.pdf")
        text = "".join(' \n'.join(page.lines).strip() for page in doc)

        # Decode the reference explicitly: relying on the platform default
        # encoding makes the comparison fail on non-UTF-8 locales.
        # NOTE(review): assumes the fixture is stored as UTF-8 - confirm.
        with open(
            "tests/resources/Fairy-Circles-Truly-a-Fairy-Tale-R-FKB-Kids-Stories_FA.txt",
            "r",
            encoding="utf-8",
        ) as f:
            correct = f.read()
        assert correct == text
示例#2
0
 def ingest(self, file_path, entity):
     """Open ``file_path`` as a PDF and run the extraction steps on it.

     Any exception raised while building the ``Document`` is re-raised as a
     ``ProcessingException`` chained to the original error. On success the
     metadata, XMP metadata and content extraction helpers are called in
     that order.
     """
     try:
         document = Document(bytes(file_path))
     except Exception as err:
         message = "Could not extract PDF file: %r" % err
         raise ProcessingException(message) from err  # noqa
     else:
         self.extract_metadata(document, entity)
         self.extract_xmp_metadata(document, entity)
         self.pdf_extract(entity, document)
示例#3
0
文件: pdf.py 项目: x0rzkov/aleph
 def pdf_alternative_extract(self, entity, pdf_path):
     """Store the PDF at ``pdf_path``, record its hash and extract it.

     The value returned by ``self.manager.store`` is written to the
     entity's ``pdfHash`` property before the document is opened and
     handed to ``pdf_extract``.
     """
     content_hash = self.manager.store(pdf_path)
     entity.set('pdfHash', content_hash)

     # Document is given the path as bytes.
     document = Document(bytes(pdf_path))
     self.pdf_extract(entity, document)
# Command-line interface: one or more PDF paths, a character threshold and a
# switch that inverts the selection.
parser = argparse.ArgumentParser(
    description="checks for presence or absence of text on images")
parser.add_argument("input_files", type=str, nargs="+", help="path to a PDF")
parser.add_argument(
    "--threshold",
    type=int,
    default=0,
    help="maximum number of chars to consider a page empty",
)
parser.add_argument("--absence",
                    action="store_true",
                    help="returns pages without text")

args = parser.parse_args()

# One output line per input file: space-separated 1-based page numbers.
for input_file in args.input_files:
    doc = Document(input_file)
    output = []
    num_pages = 0

    for page_number, page in enumerate(doc, start=1):  # 1-based for PDFs
        num_pages = page_number
        num_chars = sum(map(len, page.lines))
        # A page counts as having text once it exceeds the threshold.
        if num_chars > args.threshold:
            output.append(page_number)

    if args.absence:
        # Complement of the pages with text. Sets are unordered, so sort
        # explicitly to print the page numbers in ascending order.
        output = sorted(set(range(1, num_pages + 1)).difference(output))

    print(" ".join(map(str, output)))
示例#5
0
 def test_empty_pdf(self):
     """Opening ``tests/resources/empty.pdf`` must raise ``IOError``."""
     empty_pdf = "tests/resources/empty.pdf"
     with pytest.raises(IOError):
         Document(empty_pdf)
示例#6
0
 def test_non_pdf_file(self):
     """Opening a ``.txt`` file as a document must raise ``IOError``."""
     text_file = "tests/resources/not-pdf.txt"
     with pytest.raises(IOError):
         Document(text_file)
示例#7
0
 def test_directory_path(self):
     """Passing a directory instead of a file must raise ``IOError``.

     The path uses the same ``tests/resources/`` prefix as the other
     fixtures, so the test targets a directory rather than a path that
     simply does not exist.
     """
     with pytest.raises(IOError):
         Document("tests/resources/")
示例#8
0
 def test_non_existent_file(self):
     """Opening ``tests/resources/not-exists.pdf`` must raise ``IOError``."""
     missing_pdf = "tests/resources/not-exists.pdf"
     with pytest.raises(IOError):
         Document(missing_pdf)
示例#9
0
 def test_extract_metadata(self):
     """Both ``metadata`` and ``xmp_metadata`` must be truthy for FAC.pdf."""
     document = Document("tests/resources/FAC.pdf")
     # Checked one at a time so a failure names the missing attribute.
     assert document.metadata
     assert document.xmp_metadata
示例#10
0
 def test_extract_text(self):
     """The text extracted from prop.pdf must contain ``Milestones``.

     Each page's lines are joined with a space followed by a newline and
     stripped; the per-page strings are concatenated without a separator.
     """
     document = Document("tests/resources/prop.pdf")
     page_texts = [' \n'.join(page.lines).strip() for page in document]
     extracted = "".join(page_texts)
     assert "Milestones" in extracted
示例#11
0
 def test_bytes_paths(self):
     """Bytes paths are accepted for both the document and the image output.

     After extraction the ``tests/images`` directory must exist and hold
     exactly four ``.png`` files.
     """
     self._clean_images()
     document = Document(b"tests/resources/FAC.pdf")
     document.extract_images(path=b"tests/images", prefix="img")

     assert os.path.exists("tests/images")
     png_pattern = os.path.join("tests/images", "*.png")
     extracted_images = glob.glob(png_pattern)
     assert len(extracted_images) == 4
示例#12
0
 def test_extract_images(self, path, no_imgs):
     """Extracting images from ``path`` must yield ``no_imgs`` PNG files.

     Images are written to ``tests/images`` with the ``img`` prefix; the
     directory must exist afterwards.
     """
     self._clean_images()
     document = Document(path)
     document.extract_images(path="tests/images", prefix="img")

     assert os.path.exists("tests/images")
     png_pattern = os.path.join("tests/images", "*.png")
     extracted_images = glob.glob(png_pattern)
     assert len(extracted_images) == no_imgs
示例#13
0
文件: pdf.py 项目: slad99/ingestors
 def pdf_alternative_extract(self, pdf_path):
     """Emit ``pdf_path`` as the PDF alternative, then extract its content.

     The path is UTF-8 encoded before it is passed to ``Document``.
     """
     self.result.emit_pdf_alternative(pdf_path)
     encoded_path = pdf_path.encode('utf-8')
     document = Document(encoded_path)
     self.pdf_extract(document)
示例#14
0
 def pdf_alternative_extract(self, pdf_path):
     """Open the PDF at ``pdf_path`` and pass it to ``pdf_extract``.

     The path is UTF-8 encoded before it is passed to ``Document``.
     """
     encoded_path = pdf_path.encode('utf-8')
     document = Document(encoded_path)
     self.pdf_extract(document)