def get_pdf_text(pdf_path):
    """Extract the text of a PDF with PDFBox and drop every non-ASCII character.

    Args:
        pdf_path: Location of the PDF; handed unchanged to
            ``PDFBox.extract_text``.

    Returns:
        The result of ``extract_text(pdf_path, sort=True)`` with each run of
        characters outside the 7-bit ASCII range removed.
    """
    extractor = pdfbox.PDFBox()
    raw_text = extractor.extract_text(pdf_path, sort=True)
    # Non-ASCII runs are deleted outright, not replaced by a placeholder.
    return re.sub(r'[^\x00-\x7F]+', '', raw_text)
def test_extract_images(self):
    """Check that extracting images from ``./test3.pdf`` produces ``test-1.png``.

    The images are written into a temporary directory using the absolute
    prefix ``<tmpdir>/test``; the directory is discarded when the ``with``
    block exits.
    """
    extractor = pdfbox.PDFBox()
    with TemporaryDirectory() as output_dir:
        output_prefix = (Path(output_dir) / 'test').resolve()
        extractor.extract_images('./test3.pdf', prefix=output_prefix)
        # assertIn shows the actual directory listing when the file is missing.
        self.assertIn('test-1.png', os.listdir(output_dir))
def test_extract_text(self):
    """Check the text extracted from ``./test.pdf`` against the expected line.

    The expected string ends with LF on Linux and macOS and with CRLF on
    Windows. On any other value of ``platform`` no assertion is made.
    """
    extracted = pdfbox.PDFBox().extract_text('./test.pdf')
    if platform in ("linux", "linux2", "darwin"):
        self.assertEqual(extracted, 'this is a test PDF\n')
    elif platform == "win32":
        self.assertEqual(extracted, 'this is a test PDF\r\n')
def test_pdf_to_images(self):
    """Check that rendering ``./test2.pdf`` yields ``test1.jpg`` and ``test2.jpg``.

    The images are written into a temporary directory using the absolute
    prefix ``<tmpdir>/test``; the directory is discarded when the ``with``
    block exits.
    """
    extractor = pdfbox.PDFBox()
    with TemporaryDirectory() as output_dir:
        output_prefix = (Path(output_dir) / 'test').resolve()
        extractor.pdf_to_images('./test2.pdf', outputPrefix=output_prefix)
        # List the directory once and assert per file, so a failure names the
        # missing image and shows what was actually produced.
        produced = os.listdir(output_dir)
        self.assertIn('test1.jpg', produced)
        self.assertIn('test2.jpg', produced)
def test_python_pdfbox_extract_text_generates_correct_output():
    """Check that ``extract_text`` writes exactly one file matching the reference.

    ``test.pdf`` from ``CONFIG.test_files_dir`` is extracted into a temporary
    directory and the result is compared with ``test.txt`` from the same
    directory.
    """
    source_pdf = CONFIG.test_files_dir / 'test.pdf'
    expected_output = CONFIG.test_files_dir / 'test.txt'
    extractor = pdfbox.PDFBox()
    with TemporaryDirectory() as output_dir:
        output_path = (Path(output_dir) / f'{source_pdf.stem}.txt').resolve()
        extractor.extract_text(source_pdf, output_path=output_path)
        assert output_path.exists()
        # Nothing besides the requested output file may be created.
        assert len(list(Path(output_dir).iterdir())) == 1
        # shallow=False forces a byte-by-byte comparison; the default may
        # accept two files on matching os.stat() signatures alone.
        assert filecmp.cmp(output_path, expected_output, shallow=False)
def extract_pdf(file_path, upload_folder, file_name, *args):
    """Extract a PDF's text, translate its beginning and save the translation.

    Args:
        file_path: PDF to read with PDFBox.
        upload_folder: Directory that receives the output file.
        file_name: Name of the uploaded file. The part before its first ``.``
            plus ``_translated.txt`` becomes the output file name.
        *args: Accepted for call compatibility and ignored.

    Returns:
        None. The translation is written to
        ``<upload_folder>/<stem>_translated.txt`` as UTF-8.
    """
    print('extracting text from: {0}'.format(file_path))
    text = pdfbox.PDFBox().extract_text(file_path)
    # Line breaks are carried through translate() as <br/> markers and turned
    # back into newlines afterwards.
    formatted_text = "<br/>".join(text.splitlines())
    # Only the first 4000 characters of the marked-up text are translated.
    translation = translate(formatted_text[0:4000])
    formatted_translation = translation.replace('<br/>', '\n')
    # NOTE(review): everything after the FIRST dot is dropped, so
    # "my.report.pdf" is saved as "my_translated.txt". Kept as-is because
    # callers may rely on the resulting name.
    f_name = file_name.split('.')[0]
    uploaded_file_path = os.path.join(upload_folder, f_name + '_translated.txt')
    # An explicit encoding keeps non-ASCII translations from failing on
    # platforms whose default encoding cannot represent them.
    with open(uploaded_file_path, "w", encoding="utf-8") as f:
        f.write(formatted_translation)
def convert_txt(indir, outdir, pdfname):
    """Convert ``indir/pdfname`` into a re-flowed text file inside ``outdir``.

    PDFBox first writes the raw text to a temporary ``.txt`` file in
    ``indir``. That text is then joined into one stream: a line ending in
    ``-`` is glued to the following line with the hyphen dropped, and every
    other line break becomes a single space. The temporary file is removed
    afterwards.

    Args:
        indir: Directory containing the PDF.
        outdir: Directory for the result; created if it does not exist.
        pdfname: File name of the PDF. Single quotes are removed from it
            before it is used.

    Returns:
        None. The output is saved as ``outdir/<name>_converted.txt``, where
        ``<name>`` is ``pdfname`` with ``.pdf`` removed and spaces replaced by
        underscores.
    """
    print("\n")
    print(
        ">>>>>>>>>>>>>>>>>>>>>>>> Installing PDFBox... <<<<<<<<<<<<<<<<<<<<<<<<<<<<"
    )
    # NOTE(review): installs the dependency on every call; kept because
    # callers may rely on it. Consider moving this to project setup.
    os.system('pip install python-pdfbox')
    print("\n")
    print(
        ">>>>>>>>>>>>>>>>>>>>>>>> Converting File... <<<<<<<<<<<<<<<<<<<<<<<<<<<<"
    )
    print(" => Inputing a document...")

    # Remove single quotes from the file name. The input path is built from
    # the stripped name, so the file on disk must already carry that name.
    pdfname = pdfname.replace("'", "")

    # Extract text using PDFBox. Only the file name is sanitised for the
    # temporary file, so an input directory containing spaces or ".pdf" is
    # left untouched.
    input_fp = os.path.join(indir, pdfname)
    temp_name = os.path.splitext(pdfname)[0].replace(' ', '_') + '.txt'
    temp_txt = os.path.join(indir, temp_name)
    p = pdfbox.PDFBox()
    p.extract_text(input_fp, temp_txt)

    # Create outdir without going through a shell, so paths containing spaces
    # or shell metacharacters are handled correctly.
    os.makedirs(outdir, exist_ok=True)

    print(" => Converting pdf to txt...")
    textname = pdfname.replace('.pdf', '_converted.txt')
    textname = textname.replace(' ', '_')
    output_fp = os.path.join(outdir, textname)

    # Concatenate split lines. newline='\n' keeps line splitting on LF only,
    # and stripping the terminator explicitly handles CRLF endings as well as
    # a final line that has no terminator at all.
    with open(temp_txt, encoding='utf-8', newline='\n') as raw_txt, \
            open(output_fp, 'w', encoding='utf-8') as output_txt:
        for line in raw_txt:
            content = line.rstrip('\r\n')
            if content.endswith('-'):
                # Hyphenated line break: drop the hyphen and add no space.
                output_txt.write(content[:-1])
            else:
                output_txt.write(content + ' ')

    # Remove the intermediate PDFBox output.
    os.remove(temp_txt)

    print(" => Done! File is saved as '" + output_fp + "'")
    print("\n")
def extract(self, filename, method="pdfbox"):
    """Extract the raw text of a PDF file using PDFBox or Textract.

    PDFBox is the default. When PDFBox yields no text, the extraction is
    retried with Textract using its ``pdfminer`` method.

    Args:
        filename: PDF file to read.
        method: ``"pdfbox"`` (default) or ``"textract"``.

    Returns:
        The extracted text.

    Raises:
        ValueError: If ``method`` is neither ``"pdfbox"`` nor ``"textract"``.
    """
    # Reject unknown methods up front; otherwise neither branch below runs
    # and the function fails with an unrelated UnboundLocalError.
    if method not in ("pdfbox", "textract"):
        raise ValueError(f"unknown extraction method: {method!r}")

    text = None
    if method == "pdfbox":
        text = pdfbox.PDFBox().extract_text(filename)
        # Truthiness covers both an empty string and a None result.
        if not text:
            method = "textract"
    if method == "textract":
        byte_text = textract.process(filename, encoding="utf-8", method="pdfminer")
        text = byte_text.decode("utf-8")
    return text
class PDFBoxExtractor(ITextExtractor):
    """Text extractor that writes one text file per PDF page using PDFBox.

    The class attributes below are forwarded unchanged to
    ``PDFBox.extract_text`` on every call made by ``pdf_to_txt``.
    """

    # PDFBox wrapper, created once when the class body is executed.
    p: pdfbox.PDFBox = pdfbox.PDFBox()
    encoding: str = 'utf-8'
    html: bool = False
    sort: bool = False
    ignore_beads: bool = False
    console: bool = False

    def pdf_to_txt(
        self,
        filename: Union[str, os.PathLike],
        output_folder: Union[str, os.PathLike],
        first_page: int = 1,
        last_page: Optional[int] = None,
    ) -> None:
        """Extract each page of ``filename`` into its own text file.

        Args:
            filename: PDF file to read.
            output_folder: Directory receiving ``<stem>_<page:04>.txt`` files.
            first_page: First page to extract.
            last_page: Last page to extract (inclusive). ``None``, or a value
                beyond the page count, means the last page of the document.
        """
        basename = Path(filename).stem

        # TODO Remove num_pages
        num_pages = pdf2image.pdfinfo_from_path(filename)['Pages']
        if last_page is None or last_page > num_pages:
            last_page = int(num_pages)

        # TODO
        # for page in p.get_pages('filename'): -> sorted list of strings (or list of strings + titles, or markup)
        for page in range(first_page, last_page + 1):
            output_filename = Path(output_folder) / f'{basename}_{page:04}.txt'
            # start_page == end_page restricts each call to a single page.
            self.p.extract_text(
                filename,
                output_path=output_filename,
                encoding=self.encoding,
                html=self.html,
                sort=self.sort,
                ignore_beads=self.ignore_beads,
                start_page=page,
                end_page=page,
                console=self.console,
            )

        logger.success(f'Extracted: {basename}, pages: {num_pages}')

    def batch_extract(
        self,
        files: List[Path],
        output_folder: Union[str, os.PathLike],
        *,
        first_page: int = 1,
        last_page: Optional[int] = None,
    ) -> None:
        """Run ``pdf_to_txt`` on every file, logging to ``extract.log``.

        If ``<output_folder>/extract.log`` already exists, ``files`` is first
        filtered through ``_skip_completed``. Nothing happens when no files
        remain.

        Args:
            files: PDF files to process.
            output_folder: Directory receiving the text files and the log.
            first_page: Passed on to ``pdf_to_txt``.
            last_page: Passed on to ``pdf_to_txt``.
        """
        logfile = Path(output_folder) / 'extract.log'
        if logfile.exists():
            files = self._skip_completed(files, logfile)
        if not files:
            return

        file_logger = self._add_logger(logfile)
        total_files = len(files)
        try:
            for i, filename in enumerate(files, start=1):
                print(f'Processing {filename.stem}\t{i:03}/{total_files}', end='\r')
                self.pdf_to_txt(filename, output_folder, first_page, last_page)
        finally:
            # Detach the file logger even when a conversion raises, so it is
            # not left registered after a failed batch.
            self._remove_logger(file_logger)
def main():
    """Print the text PDFBox extracts from ``./simple1.pdf`` below a dashed rule."""
    import pdfbox

    extracted = pdfbox.PDFBox().extract_text('./simple1.pdf')
    print('--------------------')
    print(extracted)
# -*- coding: utf-8 -*- import pdfbox from pathlib import Path p = pdfbox.PDFBox() end_tags = [ 'Introduction', 'introduction', 'Introductoin', 'INTRODUCTION', 'Motivation', 'Background', 'motivation', 'background' ] def extract_abstract(filepath, start_tag='Abstract', end_tags=end_tags): """Extract abstract from a PDF-formatted scientific and technological article 从 PDF 格式的科技论文中抽取摘要 """ text = p.extract_text(filepath, 'pdf.txt') res = [] flag = False with open('pdf.txt', 'r', encoding='utf-8') as f: for l in f.readlines(): if start_tag in l: flag = True elif flag: l = l.strip() if l == '': continue for t in end_tags: if t in l or t.lower() in l: if res: res[-1] = res[-1].rstrip()
def setUpClass(cls):
    """Create one ``PDFBox`` object and store it on the class as ``p``."""
    shared_box = pdfbox.PDFBox()
    cls.p = shared_box
def test_init_multiple(self):
    """Construct an additional ``PDFBox`` object; constructing it must not raise."""
    # A further instance of the class is created here and bound to a local.
    second_box = pdfbox.PDFBox()
def _generate_image(self):
    """Render ``self.filename`` to images and record an image name.

    PDFBox is called with ``self.rootname`` as the output prefix, and
    ``self.imagename`` is then set to that prefix followed by ``1.jpg``.
    """
    converter = pdfbox.PDFBox()
    converter.pdf_to_images(self.filename, outputPrefix=self.rootname)
    self.imagename = f'{self.rootname!s}1.jpg'