def get_file_mimetype(file):
    """Detect the MIME type of a file on disk.

    ExifTool provides the first guess. A plain ``application/pdf`` is promoted
    to ``application/pdfa`` when its XMP metadata declares a PDF/A version
    listed in ``app.config["PDFA"]["ACCEPTED_VERSIONS"]``. A type listed in
    ``app.config["GENERIC_MIMETYPES"]`` is refined with the ``magic`` module,
    first by MIME type and, if that is still generic, by matching the textual
    description of the first 2048 bytes against ``app.config["FILEFORMATS"]``.

    :param file: object exposing the path of the file as ``file.name``
    :return: the detected MIME type, or ``"Unknown/Corrupted"`` when a
        ``ValueError`` or ``PdfReadError`` is raised while inspecting the file
    """
    generic_types = app.config["GENERIC_MIMETYPES"]
    try:
        # The context manager shuts the exiftool helper down once the lookup is done.
        with exiftool.ExifToolHelper() as helper:
            mime_type = helper.get_metadata(file.name)[0]["File:MIMEType"]

        if mime_type == "application/pdf":
            # Check whether the PDF declares an accepted PDF/A version.
            with open(file.name, mode="rb") as file_data:
                reader = PdfFileReader(file_data, strict=False)
                try:
                    metadata = reader.getXmpMetadata()
                    if metadata:
                        pdfa = app.config["PDFA"]
                        nodes = metadata.getNodesInNamespace("", pdfa["NAMESPACE"])
                        if get_pdfa_version(nodes) in pdfa["ACCEPTED_VERSIONS"]:
                            mime_type = "application/pdfa"
                except ExpatError:
                    app.logger.log(logging.WARNING, "File {0} has not well-formed XMP data, could not verify if application/pdf has PDF/A1 DOCINFO.".format(file.name))
        elif mime_type in generic_types:
            mime_type = magic.from_file(file.name, mime=True)
            if mime_type in generic_types:
                with open(file.name, mode="rb") as file_data:
                    description = magic.from_buffer(file_data.read(2048))
                # zip() stops at the shorter list; zip_longest() padded with None,
                # which either raised TypeError or produced a None MIME type.
                # The last matching format wins, as before.
                for candidate_type, file_format in zip(app.config["FILEMIMETYPES"], app.config["FILEFORMATS"]):
                    if description in file_format:
                        mime_type = candidate_type
    except (ValueError, PdfReadError):
        mime_type = "Unknown/Corrupted"
    return mime_type
def test_PdfReaderJpegImage(self):
    '''
    Load ``jpeg.pdf``, pull the raw data of image ``/Im4`` from the first
    page and compare its hex dump with the contents of ``jpeg.txt``.

    Expected outcome: file loads, image matches expected.
    '''
    pdf_path = os.path.join(RESOURCE_ROOT, 'jpeg.pdf')
    expected_path = os.path.join(RESOURCE_ROOT, 'jpeg.txt')

    with open(pdf_path, 'rb') as pdf_stream:
        reader = PdfFileReader(pdf_stream)

        # Expected hex dump of the image
        with open(expected_path, 'r') as expected_file:
            expected_hex = expected_file.read()

        first_page = reader.getPage(0)
        x_objects = first_page['/Resources']['/XObject'].getObject()
        actual_hex = binascii.hexlify(x_objects['/Im4'].getData()).decode()

        # Compare the extracted image with the known source
        failure_message = (
            'PDF extracted image differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
            % (expected_hex, actual_hex))
        self.assertEqual(actual_hex, expected_hex, msg=failure_message)
def extract_information(pdf_path, pagenum):
    """Print and speak the text of one PDF page and return the document info.

    The stripped page text is printed to stdout and, when it is not empty,
    synthesised with gTTS and saved as ``file.mp3`` in the current working
    directory.

    :param pdf_path: path of the PDF file to read
    :param pagenum: zero-based index of the page whose text is extracted
    :return: the value of ``PdfFileReader.getDocumentInfo()`` for the file
    """
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        page_text = pdf.getPage(pagenum).extractText().strip()

    print(page_text)

    # gTTS refuses empty text, so only synthesise when there is something to say.
    if page_text:
        # The language is passed by keyword: in recent gTTS releases the second
        # positional parameter is ``tld``, not ``lang``.
        tts = gTTS(page_text, lang='en')
        tts.save("file.mp3")
    return information
def PDFMerge(savePath, pdfPath, watermarkPdfPath):
    """Stamp the first page of a watermark PDF onto every page of a PDF.

    The watermark page is translated by half the difference between the
    upper-right media box coordinates of the two pages, which centres it when
    both media boxes start at the origin.

    :param savePath: path of the PDF file to write
    :param pdfPath: path of the PDF to be watermarked
    :param watermarkPdfPath: path of the PDF whose first page is the watermark
    """
    # Both inputs stay open until the result is written; the context managers
    # guarantee that every handle, including the output, is closed afterwards.
    with open(pdfPath, 'rb') as pdfFile, open(watermarkPdfPath, 'rb') as watermarkPdfFile:
        pdfReader = PdfFileReader(pdfFile, strict=False)
        watermarkPdf = PdfFileReader(watermarkPdfFile, strict=False).getPage(0)
        pdfWriter = PdfFileWriter()

        # Process every page of the source document
        for pageNum in range(pdfReader.numPages):
            pageObj = pdfReader.getPage(pageNum)

            # Offsets that place the watermark in the middle of the page
            x = (pageObj.mediaBox[2] - watermarkPdf.mediaBox[2]) / 2
            y = (pageObj.mediaBox[3] - watermarkPdf.mediaBox[3]) / 2

            # Merge the watermark page into the current page
            pageObj.mergeTranslatedPage(page2=watermarkPdf, tx=x, ty=y, expand=False)

            # Queue the merged page for output
            pdfWriter.addPage(pageObj)

        # Save the result
        with open(savePath, 'wb') as resultFile:
            pdfWriter.write(resultFile)
def pdf_metadata_save(pdf_file, metadata, substitute_all_metadata=False, make_backup=True):
    """Rewrite a PDF with updated document-information entries.

    The original file is first renamed to a backup, then a copy with the new
    metadata is written to the original path.

    :param pdf_file: path of the PDF to update
    :param metadata: mapping of entry names (without the leading ``/``) to
        values; every value is stored as ``str(value)``
    :param substitute_all_metadata: when false, the entries already present in
        the PDF are copied first and then overridden by ``metadata``
    :param make_backup: a string is used as the backup path; otherwise the
        backup is ``pdf_file`` with its extension replaced by ``.bak``. When
        the value equals ``False`` the backup is deleted at the end.
    """
    if isinstance(make_backup, str):
        bak_file = make_backup
    else:
        bak_file = os.path.splitext(pdf_file)[0] + ".bak"
    os.rename(pdf_file, bak_file)

    with open(bak_file, 'rb') as fin:
        pdf_in = PdfFileReader(fin)
        writer = PdfFileWriter()
        for page in range(pdf_in.getNumPages()):
            writer.addPage(pdf_in.getPage(page))

        info_dict = writer._info.getObject()
        if not substitute_all_metadata:
            # documentInfo is None for a PDF without an /Info dictionary;
            # treat that as "nothing to carry over" instead of crashing.
            existing = pdf_in.documentInfo or {}
            for key in existing:
                info_dict.update({key: existing[key]})
        for key in metadata:
            info_dict.update({NameObject('/' + key): createStringObject(str(metadata[key]))})

        # Written while the input is still open because the pages reference it.
        with open(pdf_file, 'wb') as fout:
            writer.write(fout)

    # Equality (not identity) is kept on purpose so that legacy callers passing
    # 0 still get the backup removed.
    if make_backup == False:  # noqa: E712
        os.unlink(bak_file)
def get_reader(filename, password):
    """Open a PDF and return a reader for it, decrypting it when needed.

    The opened file object is stored in the module-level ``old_file`` and is
    left open on success, because the returned reader still reads from it;
    closing it is up to the caller. On every failure path the file is closed
    here and ``None`` is returned.

    :param filename: path of the PDF file
    :param password: password for an encrypted file, or ``None``
    :return: a ``PdfFileReader``, or ``None`` when the file cannot be opened,
        needs a password that was not given, or the password is wrong
    """
    global old_file
    try:
        old_file = open(filename, 'rb')
    except Exception as err:
        print('文件打开失败!' + str(err))
        write_result(filename, '文件打开失败')
        return None

    # Create the reader; do not leak the handle if parsing fails
    try:
        pdf_reader = PdfFileReader(old_file, strict=False)
    except Exception:
        old_file.close()
        raise

    # Decrypt if necessary
    if pdf_reader.isEncrypted:
        if password is None:
            print('%s文件被加密,需要密码!' % filename)
            write_result(filename, '文件需要密码')
            old_file.close()
            return None
        # decrypt() returns 0 on failure, 1 for the user password and 2 for the
        # owner password; only 0 means the password is wrong.
        if pdf_reader.decrypt(password) == 0:
            print('%s密码不正确!' % filename)
            old_file.close()
            return None
    return pdf_reader
def pdf_mediabox(filename):
    """Report the media box size of the first page of a PDF.

    :param filename: path of the PDF file
    :return: ``result_line(filename, height, width)`` where both dimensions
        are multiplied by ``points_to_mm`` and rounded
    """
    # The context manager closes the file, which the bare open() call never did.
    with open(filename, 'rb') as stream:
        media_box = PdfFileReader(stream).getPage(0).mediaBox
        width = float(media_box.getWidth())
        height = float(media_box.getHeight())
    return result_line(filename, round(height * points_to_mm), round(width * points_to_mm))
def split(file):
    """
    Split the chosen PDF page by page and save every page as a new PDF file
    in the splitter output directory. Each output file holds one page of the
    original document.

    :param file: The file chosen by the user for splitting
    :return: None
    """
    # Clean the directory to avoid duplicated files/directories
    Splitter.cleanDir()

    # Normalise the file name
    file = Merger.toPath(file)

    with open(file, mode='rb') as pdf_file_to_read:
        # Parse the document once instead of once per page
        pdf_file = PdfFileReader(pdf_file_to_read)
        for page in range(pdf_file.numPages):
            current_page = PdfFileWriter()
            current_page.addPage(pdf_file.getPage(page))
            # Output files are numbered from 1
            with open(join(Splitter.splitter_dir, f"página_{page + 1}.pdf"), mode='wb') as pdf:
                current_page.write(pdf)
def test_PdfReaderFileLoad(self):
    '''
    Load ``crazyones.pdf``, extract the text of its first page (newlines
    removed, UTF-8 encoded) and compare it with ``crazyones.txt``.

    Expected outcome: file loads, text matches expected.
    '''
    pdf_path = os.path.join(RESOURCE_ROOT, 'crazyones.pdf')
    expected_path = os.path.join(RESOURCE_ROOT, 'crazyones.txt')

    with open(pdf_path, 'rb') as pdf_stream:
        first_page = PdfFileReader(pdf_stream).getPage(0)

        # Expected text, read as bytes
        with open(expected_path, 'rb') as expected_file:
            expected_text = expected_file.read()

        extracted = first_page.extractText()
        extracted_text = extracted.replace('\n', '').encode('utf-8')

        # Compare the extracted text with the known source
        failure_message = (
            'PDF extracted text differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
            % (expected_text, extracted_text))
        self.assertEqual(extracted_text, expected_text, msg=failure_message)
def check_file_content(original, converted):
    """Check that a converted PDF kept its page count and has page content.

    :param original: path of the source PDF
    :param converted: path of the converted PDF
    :return: ``True`` when both files have the same, non-zero number of pages
        and the first page of the converted file has a content stream;
        ``False`` otherwise
    """
    with open(original, mode="rb") as original_data:
        original_page_num = PdfFileReader(original_data, strict=False).numPages

    with open(converted, mode="rb") as converted_data:
        converted_pdf = PdfFileReader(converted_data, strict=False)
        converted_page_num = converted_pdf.numPages
        if original_page_num != converted_page_num or converted_page_num == 0:
            return False
        # Inspect a real page of the document. The previous code built a fresh,
        # empty PageObject, whose getContents() is always None, so the check
        # could never succeed.
        return converted_pdf.getPage(0).getContents() is not None
def start_Encryption(self):
    """Encrypt the selected PDF with the passwords typed into the form.

    The encrypted copy is written next to the source file with
    ``_encrypted`` inserted before the extension. On success the form is
    reset; when a password field is empty an error dialog names it. Any
    failure while encrypting is reported in a dialog instead of being
    silently ignored.
    """
    import os

    def show_message(title, icon, text):
        # Small helper: every dialog in this method has the same shape.
        box = QMessageBox()
        box.setWindowTitle(title)
        box.setIcon(icon)
        box.setText(text)
        box.exec_()

    try:
        # self.filename is reset to '' after a successful run, hence the guard.
        source_path = self.filename[0] if self.filename else ''
        user_password = self.userPassword.text()
        owner_password = self.ownerPassword.text()

        if source_path and user_password and owner_password:
            pfw = PdfFileWriter()
            pdffile = PdfFileReader(source_path)
            for page in range(pdffile.numPages):
                pfw.addPage(pdffile.getPage(page))
            pfw.encrypt(user_password, owner_password)

            # splitext keeps this correct for '.PDF' and for paths containing
            # '.pdf' elsewhere; str.replace could leave the name unchanged and
            # overwrite the source file.
            root, extension = os.path.splitext(source_path)
            with open(root + '_encrypted' + extension, 'wb') as encrypted_file:
                pfw.write(encrypted_file)

            show_message('Done', QMessageBox.Information, 'File encryption done successfully.')

            # Reset the form for the next file
            self.filename = ''
            self.userPassword.setText('')
            self.ownerPassword.setText('')
            self.userPassword.setDisabled(True)
            self.ownerPassword.setDisabled(True)
            self.startEncryption.setDisabled(True)
        else:
            if owner_password == '':
                show_message('Error', QMessageBox.Critical, 'Owner Password Field is Empty.')
            if user_password == '':
                show_message('Error', QMessageBox.Critical, 'User Password Field is Empty.')
    except Exception as err:
        show_message('Error', QMessageBox.Critical, 'File encryption failed: %s' % err)
def pdf_metadata_load(pdf_file):
    """Return the document information of a PDF file.

    :param pdf_file: path of the PDF file
    :return: the reader's ``documentInfo`` value for the file
    """
    with open(pdf_file, 'rb') as fin:
        # The previous version also copied every page into a PdfFileWriter that
        # was never used; only the document information is needed here.
        # NOTE(review): the returned object is used after the file is closed;
        # confirm that callers never hit entries that still need the stream.
        return PdfFileReader(fin).documentInfo
def pdf_meta(tmp_file_path, original_file_name, original_file_extension):
    """Build the ``BookMeta`` record for an uploaded PDF.

    When ``use_pdf_meta`` is enabled, XMP metadata takes precedence and the
    document-information dictionary fills in whatever XMP left empty. Without
    any usable value the author is ``'Unknown'`` and the title is
    ``original_file_name``.

    :param tmp_file_path: path of the uploaded PDF
    :param original_file_name: file name used as the fallback title
    :param original_file_extension: extension stored in the result
    :return: a ``BookMeta`` describing the file
    """
    doc_info = None
    xmp_info = None

    if use_pdf_meta:
        with open(tmp_file_path, 'rb') as f:
            pdf_file = PdfFileReader(f)
            doc_info = pdf_file.getDocumentInfo()
            xmp_info = parse_xmp(pdf_file)

    # Start from "nothing known"; the 'Unknown' author is applied only at the
    # end so that it no longer blocks the document-info fallback.
    author = ''
    title = ''
    subject = ""
    tags = ""
    languages = [""]
    publisher = ""

    if xmp_info:
        author = ' & '.join(split_authors(xmp_info['author']))
        title = xmp_info['title']
        subject = xmp_info['subject']
        tags = xmp_info['tags']
        languages = xmp_info['languages']
        publisher = xmp_info['publisher']

    if doc_info:
        if author == '' and doc_info.author:
            author = ' & '.join(split_authors([doc_info.author]))
        if title == '' and doc_info.title:
            title = doc_info.title
        if subject == '':
            subject = doc_info.subject or ""
        if tags == '' and '/Keywords' in doc_info:
            keywords = doc_info['/Keywords']
            tags = keywords.decode('utf-8') if isinstance(keywords, bytes) else keywords

    # Final fallbacks. An XMP title is kept even when the PDF has no
    # document-information dictionary.
    if not author:
        author = u'Unknown'
    if not title:
        title = original_file_name

    return BookMeta(
        file_path=tmp_file_path,
        extension=original_file_extension,
        title=title,
        author=author,
        cover=pdf_preview(tmp_file_path, original_file_name),
        description=subject,
        tags=tags,
        series="",
        series_id="",
        languages=','.join(languages),
        publisher=publisher,
        pubdate="",
        identifiers=[])
def merge_pdfs(paths, output):
    """Concatenate several PDF files into one.

    :param paths: iterable of input PDF paths, merged in iteration order
    :param output: path of the merged PDF to write
    """
    merged = PdfFileWriter()
    for source_path in paths:
        source = PdfFileReader(source_path)
        page_total = source.getNumPages()
        # Append the pages of this document in their original order
        for source_page in map(source.getPage, range(page_total)):
            merged.addPage(source_page)

    # Write out the merged PDF
    with open(output, 'wb') as merged_file:
        merged.write(merged_file)
def pypdf3(self):
    """Rotate every page of ``self.file_name`` clockwise by ``self.rotation``.

    The rotated document is written to ``self.outfn``.

    :return: ``self.outfn``
    """
    with open(self.file_name, 'rb') as source_stream:
        rotated = PdfFileWriter()
        source = PdfFileReader(source_stream)

        index = 0
        while index < source.numPages:
            current = source.getPage(index)
            current.rotateClockwise(self.rotation)
            rotated.addPage(current)
            index += 1

        # Written while the source is still open
        with open(self.outfn, 'wb') as target_stream:
            rotated.write(target_stream)
    return self.outfn
def has_PDFA_XMP(file):
    """Tell whether a PDF declares an accepted PDF/A version in its XMP data.

    :param file: path of the PDF file
    :return: ``True`` when the version obtained from the XMP nodes of the
        configured namespace is in ``app.config["PDFA"]["ACCEPTED_VERSIONS"]``;
        ``False`` when there is no XMP metadata, the version is not accepted,
        or the file cannot be inspected
    """
    try:
        with open(file, mode="rb") as file_data:
            metadata = PdfFileReader(file_data, strict=False).getXmpMetadata()
            if metadata is None:
                return False
            pdfa = app.config["PDFA"]
            nodes = metadata.getNodesInNamespace("", pdfa["NAMESPACE"])
            return get_pdfa_version(nodes) in pdfa["ACCEPTED_VERSIONS"]
    except Exception:
        # Best effort: an unreadable file simply is not PDF/A. Unlike a bare
        # ``except``, this lets KeyboardInterrupt and SystemExit propagate.
        return False
def pypdf3_reader(pdf, decrypt=None):
    """
    Build a PdfFileReader for a document, decrypting it first when a password
    is supplied.

    :param pdf: PDF document to read
    :param decrypt: Owner password to decrypt pdf
    :return: PdfFileReader object
    """
    reader = PdfFileReader(pdf)
    if decrypt:
        reader.decrypt(decrypt)
    return reader
def browse_PDF(self):
    """Let the user pick a PDF and enable conversion when it is usable.

    The dialog result is stored in ``self.filename``. A file that cannot be
    parsed or that is encrypted triggers an error dialog and leaves the
    convert button disabled; cancelling the dialog does nothing further.
    """
    self.filename = QtWidgets.QFileDialog.getOpenFileName(None, 'Select PDF File', '/', 'PDF File (*.pdf)')
    selected_path = self.filename[0]
    if not selected_path:
        # Dialog cancelled
        return

    # Parse the file once; a parse error counts as "corrupted".
    try:
        usable = not PdfFileReader(selected_path).getIsEncrypted()
    except Exception:
        usable = False

    if usable:
        self.convertPDF.setDisabled(False)
        return

    # The previous condition was inverted, so this dialog was never shown.
    self.convertPDF.setDisabled(True)
    msg = QMessageBox()
    msg.setWindowTitle('Error')
    msg.setText('File curropted or encrypted.')
    msg.setIcon(QMessageBox.Critical)
    msg.exec_()
def _extract(self, report_path) -> pd.DataFrame:
    """Extract the table of a PDF report into a DataFrame.

    The report date is read from the first page; the table text starts at the
    first occurrence of ``"0-9"`` on the page returned by
    ``find_table_page`` and is split into 11 rows of ``len(INPUT_COLUMNS)``
    whitespace-separated tokens.

    :param report_path: path of the PDF report
    :return: DataFrame with a ``date`` column followed by ``INPUT_COLUMNS``
    :raises TableExtractionError: when the tokens of a row cannot be converted
    """
    num_rows = 11
    num_columns = len(INPUT_COLUMNS)

    pdf = PdfFileReader(str(report_path))
    date = extract_datetime(extract_text(pdf, page=0))
    page, _ = find_table_page(pdf)
    page = self.unknown_age_matcher.sub("unknown", page)
    data_start = page.find("0-9")
    # on 2020-09-28, they wrote floats like "1, 5"
    raw_data = page[data_start:].replace(", ", ",")
    tokens = raw_data.split()

    # In some cases, PyPDF3 doesn't read the token "≥90" (probably a bug), so
    # a ">=90" placeholder is inserted when it is missing. The token found in
    # that position could not be "90" by coincidence: if the label is missing,
    # that position holds the cumulative total of cases with age >= 90, which
    # has never been equal to 90.
    if tokens[9 * num_columns] not in {"90", ">90", "≥90"}:
        tokens.insert(9 * num_columns, ">=90")

    rows = []
    for i in range(num_rows):
        start = i * num_columns
        row_tokens = tokens[start:start + num_columns]
        try:
            values = convert_values(row_tokens, COLUMN_CONVERTERS)
        # A tuple is required here: ``ValueError or TypeError`` evaluates to
        # ``ValueError`` alone and let TypeError escape unwrapped.
        except (ValueError, TypeError) as err:
            logger.debug('Error in row %d: %s', i, err)
            raise TableExtractionError(
                f"\nError while converting values of row {i}: {err}.\n"
                f"Row tokens: {' | '.join(row_tokens)}") from err
        rows.append([date, *values])

    report_data = pd.DataFrame(rows, columns=["date", *INPUT_COLUMNS])
    return report_data
def extract(self, report_path) -> pd.DataFrame:
    """Extract the report table and return it with its derived columns.

    The table text starts at the first ``'0-9'`` on the page returned by
    ``find_table_page`` and is read as 11 rows of ``len(INPUT_COLUMNS)``
    space-separated tokens. The normalized table is checked against the
    recomputed columns before the result is returned.

    :param report_path: path of the PDF report
    :return: the output of ``compute_derived_columns`` for the extracted table
    """
    reader = PdfFileReader(str(report_path))
    report_date = extract_datetime(extract_text(reader, page=0))

    table_text, _ = find_table_page(reader)
    table_text = self.unknown_age_matcher.sub('unknown', table_text)
    table_offset = table_text.find('0-9')
    # from 28/09, they write "1,5" as "1, 5"
    tokens = table_text[table_offset:].replace(', ', ',').split(' ')

    row_count = 11
    width = len(INPUT_COLUMNS)
    rows = [
        [report_date, *convert_values(tokens[index * width:index * width + width], COLUMN_CONVERTERS)]
        for index in range(row_count)
    ]

    extracted = normalize_table(pd.DataFrame(rows, columns=['date', *INPUT_COLUMNS]))
    recomputed = compute_derived_columns(extracted)
    # sanity check
    check_recomputed_columns_match_extracted_ones(extracted=extracted, recomputed=recomputed)
    return recomputed
def pdf_parser(s):
    """Return the document-information entries of an in-memory PDF.

    :param s: raw bytes of the PDF; surrounding whitespace is stripped
    :return: dict mapping each entry name without its leading ``/`` to the
        stored value; empty when the PDF has no document information or is
        encrypted and cannot be opened with an empty password
    """
    s = s.strip()
    # Warnings are sent to the null device to suppress them. The sink stays
    # open for the whole parse so that a warning emitted after construction
    # does not hit a closed file.
    with open(os.devnull, 'w') as fp:
        pdf = PdfFileReader(BytesIO(s), strict=False, warndest=fp)
        if pdf.isEncrypted:
            try:
                # decrypt() returns 0 when the password does not match; the
                # metadata is unreadable then, same as for an unsupported
                # encryption scheme.
                if not pdf.decrypt(''):
                    return {}
            except NotImplementedError:
                return {}
        meta = pdf.getDocumentInfo() or {}
        return {key[1:]: meta.get(key) for key in meta}
def split_pdf(myfile):
    """Extract sample pages of ``/tmp/<myfile>`` and upload the results.

    For every complete group of 50 pages, the first page of the group (and
    the one after it, when it exists) is added to the writer, the writer is
    saved as ``/tmp/document-page<i>.pdf`` and that file is uploaded to
    ``destbucketName`` under ``extracted-pdf/``.

    :param myfile: name of the PDF file inside ``/tmp/``
    """
    # The context manager closes the input, which was previously left open.
    with open('/tmp/' + myfile, 'rb') as pdf_in_file:
        inputpdf = PdfFileReader(pdf_in_file)
        pages_no = inputpdf.numPages
        print(pages_no)

        # NOTE(review): the writer is shared by all iterations, so each output
        # file also contains the pages of the previous ones, and documents
        # shorter than 50 pages produce no output at all - confirm both are
        # intended.
        output = PdfFileWriter()
        for i in range(pages_no // 50):
            first_index = i * 50
            output.addPage(inputpdf.getPage(first_index))
            if first_index + 1 < pages_no:
                output.addPage(inputpdf.getPage(first_index + 1))

            print('/tmp/document-page%s.pdf' % i)
            newname = 'document-page%s.pdf' % i
            print(newname)

            with open("/tmp/document-page%s.pdf" % i, "wb") as outputStream:
                output.write(outputStream)
            client.upload_file('/tmp/' + newname, destbucketName, 'extracted-pdf/' + newname)
def start():
    """Interactively print the document information of a PDF in ``pdfs/``.

    Lists the files in ``pdfs/`` (skipping names containing ``emptyfile``),
    asks for a file name, appends ``.pdf`` when it is missing and prints
    every document-information entry of that file.
    """
    from PyPDF3 import PdfFileReader
    import glob
    import os

    print("Put PDF file in pdfs/")
    print("Which PDF file would you like to read the meta data for?")
    for d in glob.iglob("pdfs/*"):
        if "emptyfile" not in d:
            # Show the bare file name; str.replace() needs both arguments and
            # raised TypeError before.
            print(os.path.basename(d))

    ans = str(input("> "))
    if not ans.lower().endswith(".pdf"):
        ans = ans + ".pdf"

    # The listing shows names relative to pdfs/, so resolve the answer there.
    pdf_path = os.path.join("pdfs", os.path.basename(ans))
    with open(pdf_path, 'rb') as stream:
        # The reader needs an open binary stream, not a (name, mode) tuple.
        pdffile = PdfFileReader(stream)
        # A PDF without document information yields None.
        docInfo = pdffile.getDocumentInfo() or {}
        for metaItem in docInfo:
            print(f"- {metaItem}:{docInfo[metaItem]}")
    print("\n")
def reorder(input_filename: str, output_filename: str) -> None:
    """Write a copy of a PDF with its pages rearranged.

    The new page order is the sequence returned by ``_make_sequence`` for the
    page count of the input.

    :param input_filename: path of an existing PDF
    :param output_filename: path of the PDF to create; must not exist yet
    """
    assert os.path.exists(input_filename)
    assert os.path.exists(output_filename) is False

    # Context managers close both streams even when reading or writing fails.
    with open(input_filename, 'rb') as input_stream:
        input_pdf = PdfFileReader(input_stream)
        output = PdfFileWriter()

        for page_number in _make_sequence(input_pdf.getNumPages()):
            output.addPage(input_pdf.getPage(page_number))

        # Written while the input is still open because the pages reference it.
        with open(output_filename, "wb") as output_stream:
            output.write(output_stream)
def pypdf3(self):
    """Place every page of ``self.file_name`` onto a blank page of the target size.

    Each source page is merged onto a new ``self.target_w`` by
    ``self.target_h`` page, scaled by ``self.scale`` and translated by
    ``self.margin_x`` / ``self.margin_y``. The result is written to
    ``self.output``.

    :return: ``self.output``
    """
    source = PdfFileReader(self.file_name)
    resized = PdfFileWriter()

    # Number of pages in the input document
    total = source.getNumPages()
    index = 0
    while index < total:
        source_page = source.getPage(index)
        canvas = PageObject.createBlankPage(width=self.target_w, height=self.target_h)
        canvas.mergeScaledTranslatedPage(source_page, self.scale, self.margin_x, self.margin_y)
        resized.addPage(canvas)
        index += 1

    with open(self.output, "wb") as target_stream:
        resized.write(target_stream)
    return self.output
def main():
    """Create a 2-up version of a PDF.

    Usage: ``python 2-up.py input_file output_file``. Consecutive pages are
    merged pairwise, the second one placed to the right of the first. With an
    odd page count the last page is kept on its own.
    """
    if len(sys.argv) != 3:
        print("usage: python 2-up.py input_file output_file")
        sys.exit(1)

    print("2-up input " + sys.argv[1])
    with open(sys.argv[1], "rb") as input_stream:
        input1 = PdfFileReader(input_stream)
        output = PdfFileWriter()
        page_count = input1.getNumPages()

        for index in range(0, page_count - 1, 2):
            lhs = input1.getPage(index)
            rhs = input1.getPage(index + 1)
            lhs.mergeTranslatedPage(rhs, lhs.mediaBox.getUpperRight_x(), 0, True)
            output.addPage(lhs)
            # Progress on a single line (the Python 2 ``print x,`` idiom)
            print(str(index) + " ", end="")
            sys.stdout.flush()

        # A trailing unpaired page was silently dropped before.
        if page_count % 2 == 1:
            output.addPage(input1.getPage(page_count - 1))

        print("writing " + sys.argv[2])
        # ``file()`` does not exist in Python 3; open() in a context manager
        # also makes sure the output is flushed and closed.
        with open(sys.argv[2], "wb") as output_stream:
            output.write(output_stream)
    print("done.")
def write_pdf(pdf_obj, destination):
    """
    Copy every page of a PDF object into a new file.

    :param pdf_obj: PDF object to be written to file
    :param destination: Destination path
    """
    source = PdfFileReader(pdf_obj)

    # Fresh writer that receives the pages of the source
    target = PdfFileWriter()
    total = source.getNumPages()
    for source_page in map(source.getPage, range(total)):
        target.addPage(source_page)

    # Finally, write the collected pages to a real file
    with open(destination, "wb") as target_stream:
        target.write(target_stream)
def prepare(self):
    """Dump the text of ``self.inputPath`` to a text file and transform it.

    The text of every page is appended to a file under ``./.TXT``; the file
    is then rewritten with all newline and carriage-return characters removed
    and handed to ``self.transform``.
    """
    # The target name does not depend on the page, so compute it once. This
    # also keeps it defined for a PDF without pages.
    # NOTE(review): hash() of a str changes between interpreter runs unless
    # PYTHONHASHSEED is fixed, and it is passed on as an int - confirm that
    # join_paths accepts that and that a per-run file name is intended.
    filename = join_paths("./.TXT", hash(self.inputPath))

    # Process PDF input file to raw text file
    with open(self.inputPath, "rb") as pdf_stream:
        reader = PdfFileReader(pdf_stream)
        page_count = reader.numPages
        # Open the text file once instead of once per page, under a name that
        # does not shadow the PDF handle.
        with open(filename, "a") as text_file:
            for page in tqdm(range(0, page_count)):
                page_text = reader.getPage(page).extractText()
                print("Reading page", page, "of", page_count)
                text_file.write(page_text)

    # Cleaning the TEXT file for better processing
    with open(filename, "r") as text_file:
        lines = text_file.readlines()
    lines = [line.replace("\n", "").replace("\r", "") for line in lines]
    with open(filename, "w") as text_file:
        text_file.writelines(lines)

    print("Cleaning... => ", filename)
    self.transform(filename)
def add_encryption(path, encryptPath, fileDicts):
    """Write password-protected copies of several PDF files.

    :param path: directory containing the input files
    :param encryptPath: directory receiving the encrypted copies
    :param fileDicts: mapping of file name to the user password for that file
    """
    for fileName in fileDicts:
        input_pdf = os.path.join(path, fileName)
        output_pdf = os.path.join(encryptPath, fileName)

        # One writer per file: a shared writer kept the pages of the previous
        # files, so every later output also contained them.
        pdf_writer = PdfFileWriter()
        pdf_reader = PdfFileReader(input_pdf)
        for page in range(pdf_reader.getNumPages()):
            pdf_writer.addPage(pdf_reader.getPage(page))

        pdf_writer.encrypt(user_pwd=fileDicts[fileName], owner_pwd=None, use_128bit=True)

        # Remove the output file if it already exists
        if os.path.exists(output_pdf):
            os.remove(output_pdf)
        with open(output_pdf, 'wb') as fh:
            pdf_writer.write(fh)
def get_reader(filename, password):
    """Open a PDF and return a reader for it, decrypting it when needed.

    On success the underlying file is left open, because the returned reader
    still reads from it. On every failure path the file is closed here and
    ``None`` is returned.

    :param filename: path of the PDF file
    :param password: password for an encrypted file, or ``None``
    :return: a ``PdfFileReader``, or ``None`` when the file cannot be opened,
        needs a password that was not given, or the password is wrong
    """
    try:
        old_file = open(filename, 'rb')
    except Exception as err:
        print('文件打开失败!' + str(err))
        return None

    # Create the reader; do not leak the handle if parsing fails
    try:
        pdf_reader = PdfFileReader(old_file, strict=False)
    except Exception:
        old_file.close()
        raise

    # Decrypt if necessary
    if pdf_reader.isEncrypted:
        if password is None:
            print('%s文件被加密,需要密码!' % filename)
            old_file.close()
            return None
        # decrypt() returns 0 on failure, 1 for the user password and 2 for the
        # owner password; only 0 means the password is wrong.
        if pdf_reader.decrypt(password) == 0:
            print('%s密码不正确!' % filename)
            old_file.close()
            return None

    # The former ``if old_file in locals()`` test looked the file object up
    # among the local variable names, so it was always false and never closed
    # anything; the failure paths above now close the file explicitly.
    return pdf_reader