def test_metadata(self):
    """Metadata filters rewrite Title/Subject and clear every other field."""
    options = pdf_redactor.RedactorOptions()
    options.metadata_filters = {
        "Title": [lambda value: value.replace("test", "sentinel")],
        "Subject": [lambda value: value[::-1]],
        "DEFAULT": [lambda value: None],
    }
    with RedactFixture(FIXTURE_PATH, options) as redacted_path:
        pdfinfo_output = subprocess.check_output(["pdfinfo", redacted_path])
        # Title filter substituted "test" -> "sentinel".
        self.assertIn(b"this is a sentinel", pdfinfo_output)
        # Subject filter reversed the field's text.
        self.assertIn(b"FDP a si", pdfinfo_output)
        # DEFAULT filter dropped all remaining fields.
        self.assertNotIn(b"CreationDate", pdfinfo_output)
        self.assertNotIn(b"LibreOffice", pdfinfo_output)
def test_text_ssns(self):
    """SSN-like strings (including OCR look-alikes) are masked in the text layer."""
    # First normalize the various dash-like characters to an ASCII hyphen.
    dash_normalizer = (
        re.compile(u"[−–—~‐]"),
        lambda m: "-",
    )
    # Then mask anything shaped like an SSN, tolerating O/o/I/l/i digit confusions.
    ssn_masker = (
        re.compile(r"(?<!\d)(?!666|000|9\d{2})([OoIli0-9]{3})([\s-]?)(?!00)([OoIli0-9]{2})\2(?!0{4})([OoIli0-9]{4})(?!\d)"),
        lambda m: "XXX-XX-XXXX",
    )
    options = pdf_redactor.RedactorOptions()
    options.content_filters = [dash_normalizer, ssn_masker]
    with RedactFixture(FIXTURE_PATH, options) as redacted_path:
        extracted = pdf_to_text(redacted_path)
        self.assertIn("Here are some fake SSNs\n\nXXX-XX-XXXX\n--\n\nXXX-XX-XXXX XXX-XX-XXXX\n\nAnd some more with common OCR character substitutions:\nXXX-XX-XXXX XXX-XX-XXXX XXX-XX-XXXX XXX-XX-XXXX XXX-XX-XXXX", extracted)
def test_xmp(self):
    """XMP filters receive the parsed XML tree and may rewrite its text nodes."""
    options = pdf_redactor.RedactorOptions()
    # Clear the regular Info-dictionary metadata entirely.
    options.metadata_filters = {"DEFAULT": [lambda value: None]}

    def replace_writer(doc):
        # Rewrite any element whose text is exactly "Writer".
        for node in doc.iter():
            if node.text == "Writer":
                node.text = "Sentinel"
        return doc

    options.xmp_filters = [replace_writer]
    with RedactFixture(FIXTURE_PATH, options) as redacted_path:
        meta = subprocess.check_output(["pdfinfo", "-meta", redacted_path])
        self.assertIn(b"Sentinel", meta)
        self.assertNotIn(b"Writer", meta)
def test_comment(self):
    """Content filters also run over text-annotation (comment) bodies and titles."""
    options = pdf_redactor.RedactorOptions()
    options.content_filters = [
        # replacement for the comment text
        (re.compile(re.escape(u"I have a comment!")), lambda m: "all gone"),
        # replacement for the comment title
        (re.compile(re.escape(u"Unknown Author")), lambda m: "Some Person"),
    ]
    with RedactFixture(FIXTURE_PATH, options) as redacted_path:
        text = pdf_to_text(redacted_path)
        # The original test extracted the text but made no assertions, so it
        # could never fail.  At minimum, the redacted strings must not survive
        # anywhere in the extracted text.
        self.assertNotIn("I have a comment!", text)
        self.assertNotIn("Unknown Author", text)
def smoke_test_file(path):
    """Run the redactor over *path* with pass-through filters.

    Known pdf_redactor failure modes are caught and reported to stderr
    instead of raised, so a batch of files can be smoke-tested in one run.
    """
    options = pdf_redactor.RedactorOptions()
    options.output_stream = io.BytesIO()  # output is discarded
    # Raw string: "\w" is an invalid escape sequence in a plain str literal
    # (a SyntaxWarning/error on modern Python).
    options.content_filters = [(re.compile(r"\w+"), lambda match: match.group(0))]
    options.metadata_filters = {"ALL": [metadata_filter]}
    # "with" guarantees the input file is closed even if redactor() raises,
    # replacing the original try/finally + explicit close().
    with open(path, "rb") as input_stream:
        options.input_stream = input_stream
        try:
            pdf_redactor.redactor(options)
        except (pdfrw.errors.PdfParseError, IndexError, AssertionError,
                xml.etree.ElementTree.ParseError, TypeError, AttributeError,
                StopIteration, ValueError) as e:
            # Best-effort smoke test: log the failure and keep going.
            print("{0} while reading {1}".format(e.__class__.__name__, path), file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
def test_link(self):
    """Link text is redacted by a content filter; link targets by a link filter."""
    options = pdf_redactor.RedactorOptions()
    # replacement for the link text
    options.content_filters = [
        (re.compile(re.escape(u"link to issue #13")), lambda m: "this link was removed"),
    ]
    # rewrite every hyperlink target
    options.link_filters = [lambda href, annotation: "https://www.google.com"]
    with RedactFixture(FIXTURE_PATH, options) as redacted_path:
        extracted_text = pdf_to_text(redacted_path)
        self.assertNotIn("link to issue #13", extracted_text)
        self.assertIn("this link was re#o#e#", extracted_text)  # glyph replacements
        extracted_html = pdf_to_html(redacted_path)
        self.assertNotIn("github", extracted_html)
        self.assertIn('href="https://www.google.com"', extracted_html)
# Example file to print the text layer of a PDF.
import re, io, sys
import pdf_redactor

## Set options.

def printer(m):
    """Print each matched text run to stdout, then remove it from the output."""
    s = m.group(0)
    if sys.version_info < (3, ):
        # Python 2: stdout expects encoded bytes, not unicode.
        s = s.encode("utf8")
    print(s)
    return ""

options = pdf_redactor.RedactorOptions()
options.output_stream = io.BytesIO() # null
# Raw string: "\w" is an invalid escape sequence in a plain str literal
# (a SyntaxWarning/error on modern Python).  Matches every character.
options.content_filters = [(re.compile(r"[\w\W]+"), printer)]
pdf_redactor.redactor(options)
def redact(fname, searchlist): """Lets Redact the pdf """ #fname = sys.argv[1] # filename doc = fitz.open(fname) doc.setMetadata({}) # clear metadata doc._delXmlMetadata() # clear any XML metadata new_doc = False # indicator if anything found at all for page in doc: # scan through the pages for word in searchlist: print(f"Redacting word {word} in document {doc.name}") found = mark_word(page, word) # mark the page's words if found: # if anything found ... new_doc = True print("found '%s' %i times on page %i" % (word, found, page.number + 1)) if new_doc: doc.save("marked-" + str(doc.name).split('/')[-1]) import re from datetime import datetime import pdf_redactor ## Set options. options = pdf_redactor.RedactorOptions() options.metadata_filters = { # Perform some field filtering --- turn the Title into uppercase. "Title": [lambda value: value.upper()], # Set some values, overriding any value present in the PDF. "Producer": [lambda value: "My Name"], "CreationDate": [lambda value: datetime.utcnow()], # Clear all other fields. "DEFAULT": [lambda value: None], } # Clear any XMP metadata, if present. options.xmp_filters = [lambda xml: None] # Redact things that look like social security numbers, replacing the # text with X's. options.content_filters = [ # First convert all dash-like characters to dashes. ( # re.compile(u"[−–—~‐]"), # lambda m : "-" re.compile(u"LibreOffice"), lambda m: "X" ), # Then do an actual SSL regex. # See https://github.com/opendata/SSN-Redaction for why this regex is complicated. # ( # re.compile(r"(?<!\d)(?!666|000|9\d{2})([OoIli0-9]{3})([\s-]?)(?!00)([OoIli0-9]{2})\2(?!0{4})([OoIli0-9]{4})(?!\d)"), # lambda m : "XXX-XX-XXXX" # ), # Content filter that runs on the text comment annotation body. ( re.compile(r"comment!"), lambda m: "annotation?" )#, ] # Filter the link target URI. options.link_filters = [ lambda href, annotation: "https://www.google.com" ] # Perform the redaction using PDF on standard input and writing to standard output. 
# Runs the redaction configured above: reads the PDF from standard input
# and writes the redacted PDF to standard output (pdf_redactor defaults).
pdf_redactor.redactor(options)

# THE SEARCH STRING LIST — example driver for the fitz-based redact() above.
# searchlist = ['Marcus', 'Hibell', 'Lorem', 'Hampden-Sydney', 'College', 'loves']
# redact('Lorem.pdf',searchlist)

# WORKING VERSION FOR A SINGLE WORD
# fname = sys.argv[1]  # filename
# text = sys.argv[2]  # search string
# doc = fitz.open(fname)
# print("underlining words containing '%s' in document '%s'" % (text, doc.name))
# new_doc = False  # indicator if anything found at all
# for page in doc:  # scan through the pages
#     found = mark_word(page, text)  # mark the page's words
#     if found:  # if anything found ...
#         new_doc = True
#         print("found '%s' %i times on page %i" % (text, found, page.number + 1))
# if new_doc:
#     doc.save("marked-" + str(doc.name).split('/')[-1])