Exemplo n.º 1
0
def pdf_redact(input_stream, output_directory, strings_to_filter):
    """Redact contact details and caller-supplied patterns from a PDF.

    Despite its name, ``input_stream`` is treated as a filesystem path: its
    base name is reused for the output file and the path itself is handed to
    ``pdf_redactor`` as the input.

    Args:
        input_stream: Path of the PDF to redact.
        output_directory: Directory in which the redacted copy is written.
            A trailing path separator is optional.
        strings_to_filter: Iterable of regular-expression pattern strings;
            each match is replaced with ``"XXXX"``. Patterns are compiled
            as-is, so callers wanting literal matching should ``re.escape``
            them first. ``None`` is treated as "no extra patterns".

    Returns:
        The path of the redacted PDF.
    """
    filename = os.path.basename(input_stream)

    # Join rather than concatenate so that a directory passed without a
    # trailing separator still produces a path inside that directory.
    output_stream = os.path.join(output_directory, filename)

    options = pdf_redactor.RedactorOptions()
    options.input_stream = input_stream
    options.output_stream = output_stream

    # Clear any XMP metadata, if present.
    options.xmp_filters = [lambda xml: None]

    # Built-in text filters. Every pattern containing backslashes is a raw
    # string so the escapes reach the regex engine unchanged.
    options.content_filters = [
        # E-mail addresses.
        (
            re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]{0,8})"),
            lambda m: 10 * "X",
        ),
        # Presumably UK-style postcodes together with the address text that
        # precedes them -- TODO confirm the intended format.
        (
            re.compile(r'[,a-zA-Z0-9 ]+[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}'),
            lambda m: 15 * "X",
        ),
        # http/https URLs.
        (
            re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'),
            lambda m: 10 * "X",
        ),
        # Street-type and place keywords as they appear in address lines.
        (
            re.compile("Park,|Road,|Hill,|Lane,|London,|Avenue,|Essex,|Green,|Way|Bristol,|Manchester,"),
            lambda m: 8 * "X",
        ),
        # A house number, a name, then a street-type keyword.
        (
            re.compile(r'[\d]{1,2} +[a-zA-Z]{1,15} +(?:Park|Road|Hill|Lane|London|Green|Avenue|Green|Way)'),
            lambda m: 8 * "X",
        ),
        # Digit groups that look like telephone numbers.
        (
            re.compile(r"\+? {0,2}\d+ {0,2}[(-]?\d(?:[ \d]*\d)?[)-]? {0,2}\d+[/ -]?\d+[/ -]?\d+(?: *- *\d+)?"),
            lambda m: 10 * "X",
        ),
    ]

    # Caller-supplied patterns run after the built-in ones.
    options.content_filters.extend(
        (re.compile(pattern), lambda m: 4 * "X")
        for pattern in (strings_to_filter or ())
    )

    # Read the PDF at input_stream and write the redacted copy to
    # output_stream.
    pdf_redactor.redactor(options)

    return output_stream
Exemplo n.º 2
0
	def __enter__(self):
		"""Redact ``self.input_path`` into a temporary PDF and return its path.

		Opens the input file, creates a ``.pdf`` temporary file, runs
		``pdf_redactor.redactor`` with ``self.options`` wired to both, and then
		closes the temporary file so the caller can reopen it by path. The
		input file is left open on success.

		If any step fails, the handles opened here are closed and the
		temporary file is removed before the exception propagates, because
		``__exit__`` is not invoked when ``__enter__`` raises.

		Returns:
			Path of the temporary file containing the redacted PDF.
		"""
		self.input_file = open(self.input_path, "rb")
		self.redacted_path = None
		self.redacted_file = None
		try:
			self.options.input_stream = self.input_file

			fd, self.redacted_path = tempfile.mkstemp(".pdf")
			self.redacted_file = os.fdopen(fd, "wb")
			self.options.output_stream = self.redacted_file

			pdf_redactor.redactor(self.options)
			self.redacted_file.close()
		except BaseException:
			# Undo everything acquired so far; nothing else will clean up
			# after a failed __enter__.
			if self.redacted_file is not None:
				self.redacted_file.close()
			self.input_file.close()
			if self.redacted_path is not None:
				os.remove(self.redacted_path)
			raise

		return self.redacted_path
Exemplo n.º 3
0
def smoke_test_file(path):
    """Run the redactor over one PDF using a pass-through content filter.

    The content filter replaces every ``\\w+`` match with itself, and the
    metadata filter list holds ``metadata_filter`` (defined elsewhere in this
    module). Output is written to an in-memory buffer that is discarded.

    Exceptions of the types listed in the ``except`` clause are reported on
    standard error together with a traceback instead of being propagated; any
    other exception propagates to the caller.

    Args:
        path: Path of the PDF file to run through the redactor.
    """
    options = pdf_redactor.RedactorOptions()
    options.output_stream = io.BytesIO()
    # Raw string so that "\w" is not parsed as a string escape.
    options.content_filters = [(re.compile(r"\w+"),
                                lambda match: match.group(0))]
    options.metadata_filters = {"ALL": [metadata_filter]}

    # The context manager closes the input on every exit path, including
    # exceptions that are not in the list below.
    with open(path, "rb") as input_file:
        options.input_stream = input_file
        try:
            pdf_redactor.redactor(options)
        except (pdfrw.errors.PdfParseError, IndexError, AssertionError,
                xml.etree.ElementTree.ParseError, TypeError, AttributeError,
                StopIteration, ValueError) as e:
            print("{0} while reading {1}".format(e.__class__.__name__, path),
                  file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
def clean_pdf(in_file, out_file, file_metadata, author_names):
    """Write a redacted, rebranded copy of a PDF and a PNG thumbnail of it.

    Steps, in order:
      1. ``qpdf`` rewrites ``in_file`` with uncompressed streams.
      2. ``pdf_redactor`` replaces the metadata, clears XMP, and redacts
         phone numbers, e-mail addresses and author names in the text.
      3. ``qpdf`` linearizes the result and appends
         ``branding/pdf-addendum-page.pdf``.
      4. The final PDF is copied to ``out_file``.
      5. ``pdftoppm`` renders a PNG thumbnail next to ``out_file``.

    Args:
        in_file: Path of the source PDF.
        out_file: Path for the final PDF. The thumbnail uses the same path
            with a trailing ``.pdf`` removed (``pdftoppm`` appends ``.png``).
        file_metadata: Mapping with ``'title'`` and ``'date'`` entries.
            ``'date'`` is compared as a string against ``"2003-01-01"``, so it
            is presumably an ISO-formatted date string -- confirm with caller.
        author_names: Iterable of author-name strings to redact. Blank names
            are ignored.

    Raises:
        subprocess.CalledProcessError: If ``qpdf`` or ``pdftoppm`` exits with
            a non-zero status.
        Exception: Whatever the redactor raised, when the document is not
            dated before 2003-01-01.
    """
    import datetime
    import io
    import re
    import shutil
    import subprocess
    import tempfile
    from pdf_redactor import redactor, RedactorOptions

    # Form a regex for author names, replacing spaces with optional
    # whitespace. Blank names are skipped: they would contribute an
    # alternative that matches the empty string, which would insert the
    # replacement text at every position in the document.
    name_patterns = [
        r"\s?".join(re.escape(part) for part in name.split(" "))
        for name in author_names
        if name.strip()
    ]

    # Set redaction options.

    redactor_options = RedactorOptions()

    redactor_options.metadata_filters = {
        # Copy from report metadata.
        "Title": [lambda value: file_metadata['title']],
        "Author": [lambda value: "Congressional Research Service, Library of Congress, USA"],
        "CreationDate": [lambda value: file_metadata['date']],

        # Set these.
        "Producer": [lambda value: "EveryCRSReport.com"],
        "ModDate": [lambda value: datetime.datetime.utcnow()],

        # Clear all other fields.
        "DEFAULT": [lambda value: None],
    }

    # Clear XMP.
    redactor_options.xmp_filters = [lambda xml: None]

    # Redact phone numbers, email addresses, and author names.
    # See the notes on the regular expressions above for the HTML scrubber.
    # Patterns are raw strings so the backslashes reach the regex engine.
    redactor_options.content_filters = [
        (re.compile(r"((^|[^\d])7-)\d{4}"), lambda m: m.group(1) + "...."),  # use a symbol likely to be available
        (re.compile(r"\(\d\d\d\) \d\d\d-\d\d\d\d"), lambda m: "[redacted]"),  # use a symbol likely to be available
        (re.compile(r"[a-zA-Z0-9_!#\$%&\'\*\+\-/=\?\^`\{\|\}~]+(@crs.?(loc|gov))"), lambda m: ("[redacted]" + m.group(1))),
    ]
    if name_patterns:
        # Only add the author filter when there is at least one real name.
        redactor_options.content_filters.append(
            (re.compile("|".join(name_patterns)), lambda m: "(name redacted)")
        )

    # Avoid inserting ?'s and spaces.
    redactor_options.content_replacement_glyphs = ['#', '*', '/', '-']

    # Run qpdf to decompress.

    data = subprocess.check_output(['qpdf', '--qdf', '--stream-data=uncompress', in_file, "-"])

    with tempfile.NamedTemporaryFile() as f1, tempfile.NamedTemporaryFile() as f2:

        # Run the redactor. Since qpdf in the next step requires an actual file for the input,
        # write the output to a file.
        redactor_options.input_stream = io.BytesIO(data)
        redactor_options.output_stream = f1
        try:
            redactor(redactor_options)
        except Exception:
            # The redactor has some trouble on old files. Post them anyway.
            if file_metadata['date'] < "2003-01-01":
                print("Writing", out_file, "without redacting.")
                # Discard whatever the redactor managed to write before it
                # failed, so no stale bytes trail the unredacted data.
                f1.seek(0)
                f1.truncate()
                f1.write(data)
            else:
                raise
        f1.flush()

        # Linearize and add our own page to the end of the PDF. The qpdf command
        # for this is pretty weird. All we're doing is appending a page.
        subprocess.check_call(['qpdf', '--linearize', f1.name,
            "--pages", f1.name, "branding/pdf-addendum-page.pdf", "--",
            f2.name])

        # Copy the final PDF to the output location. We don't write directly to
        # out_file in the previous qpdf step in case of errors. If there's an
        # error during writing, let's not leave a broken file.
        shutil.copyfile(f2.name, out_file)

    # Generate a thumbnail image of the PDF.
    # Note that pdftoppm adds ".png" to the end of the file name.
    # Strip only a trailing ".pdf"; a ".pdf" elsewhere in the path (for
    # example in a directory name) must be left alone.
    if out_file.endswith(".pdf"):
        thumbnail_base = out_file[:-len(".pdf")]
    else:
        thumbnail_base = out_file
    subprocess.check_call(['pdftoppm', '-png', '-singlefile',
                           '-scale-to-x', '600', '-scale-to-y', '-1',
                           out_file, thumbnail_base])
Exemplo n.º 5
0
# Example file to print the text layer of a PDF.

import re, io, sys

import pdf_redactor

## Set options.


def printer(m):
    """Print the text of a regex match and replace it with nothing.

    Written as a content-filter callback: the matched text goes to standard
    output and the empty string is returned as the replacement. On Python 2
    the text is UTF-8 encoded before it is printed.
    """
    matched_text = m.group(0)
    on_python2 = sys.version_info[0] < 3
    print(matched_text.encode("utf8") if on_python2 else matched_text)
    return ""


# No input stream is assigned, so the library's default input is used --
# presumably standard input; confirm against the pdf_redactor documentation.
options = pdf_redactor.RedactorOptions()
# The rewritten PDF is not wanted here, so send it to a throwaway buffer.
options.output_stream = io.BytesIO()  # null
# Match whole runs of text and hand each one to printer(). The pattern is a
# raw string so "\w" and "\W" are not parsed as string escapes.
options.content_filters = [(re.compile(r"[\w\W]+"), printer)]
pdf_redactor.redactor(options)
Exemplo n.º 6
0
def redact(fname, searchlist):
    """Mark search words in a PDF, then run a separate pdf_redactor pass.

    Part one opens ``fname`` with ``fitz``, clears its metadata, and calls
    ``mark_word`` for every word in ``searchlist`` on every page. If any word
    was found, the document is saved as ``marked-<basename of the document>``
    in the current working directory. The document is closed afterwards,
    whether or not marking succeeded.

    Part two builds a ``pdf_redactor`` configuration and runs it.
    NOTE(review): no input or output stream is assigned on those options, so
    this pass does not appear to operate on ``fname`` or on the marked copy;
    it presumably uses the library's default streams -- confirm the intent.

    Args:
        fname: Path of the PDF to open.
        searchlist: Iterable of words passed one at a time to ``mark_word``.
    """
    import os

    doc = fitz.open(fname)
    try:
        doc.setMetadata({})    # clear metadata
        doc._delXmlMetadata()  # clear any XML metadata

        new_doc = False  # indicator if anything found at all

        for page in doc:  # scan through the pages
            for word in searchlist:
                print(f"Redacting word {word} in document {doc.name}")
                found = mark_word(page, word)  # mark the page's words
                if found:  # if anything found ...
                    new_doc = True
                    print("found '%s' %i times on page %i" % (word, found, page.number + 1))

        if new_doc:
            # os.path.basename handles the platform's path separators,
            # unlike splitting on "/" by hand.
            doc.save("marked-" + os.path.basename(str(doc.name)))
    finally:
        # Release the document even if marking or saving failed.
        doc.close()

    # Imported here, after the marking step, so a missing pdf_redactor
    # package does not prevent the marked copy from being written.
    import re
    from datetime import datetime
    import pdf_redactor

    ## Set options.

    options = pdf_redactor.RedactorOptions()
    options.metadata_filters = {
        # Perform some field filtering --- turn the Title into uppercase.
        "Title": [lambda value: value.upper()],

        # Set some values, overriding any value present in the PDF.
        "Producer": [lambda value: "My Name"],
        "CreationDate": [lambda value: datetime.utcnow()],

        # Clear all other fields.
        "DEFAULT": [lambda value: None],
    }

    # Clear any XMP metadata, if present.
    options.xmp_filters = [lambda xml: None]

    # Text filters.
    options.content_filters = [
        # Replace the literal text "LibreOffice" with "X".
        # (A disabled alternative normalised dash-like characters instead:
        #  re.compile(u"[−–—~‐]") with replacement "-".)
        (
            re.compile(u"LibreOffice"),
            lambda m: "X"
        ),

        # Disabled social-security-number filter, kept for reference.
        # See https://github.com/opendata/SSN-Redaction for why this regex is complicated.
        # (
        # 	re.compile(r"(?<!\d)(?!666|000|9\d{2})([OoIli0-9]{3})([\s-]?)(?!00)([OoIli0-9]{2})\2(?!0{4})([OoIli0-9]{4})(?!\d)"),
        # 	lambda m : "XXX-XX-XXXX"
        # ),

        # Content filter that runs on the text comment annotation body.
        (
            re.compile(r"comment!"),
            lambda m: "annotation?"
        ),
    ]

    # Filter the link target URI.
    options.link_filters = [
        lambda href, annotation: "https://www.google.com"
    ]

    # Perform the redaction with the streams configured on options (none are
    # assigned above, so the library defaults apply).
    pdf_redactor.redactor(options)






# THE SEARCH STRING LIST
# searchlist = ['Marcus', 'Hibell', 'Lorem', 'Hampden-Sydney', 'College', 'loves']
# redact('Lorem.pdf',searchlist)









# WORKING VERSION FOR A SINGLE WORD
# fname = sys.argv[1]                    # filename
# text = sys.argv[2]                     # search string
# doc = fitz.open(fname)

# print("underlining words containing '%s' in document '%s'" % (text, doc.name))

# new_doc = False                        # indicator if anything found at all

# for page in doc:                       # scan through the pages
#     found = mark_word(page, text)      # mark the page's words
#     if found:                          # if anything found ...
#         new_doc = True
#         print("found '%s' %i times on page %i" % (text, found, page.number + 1))

# if new_doc:
#     doc.save("marked-" + str(doc.name).split('/')[-1])