def toUnicode(string, font, fontcache):
    """Decode the raw bytes of a PDF text string into Unicode text.

    string    -- the bytes of the text operand.
    font      -- the font object active for this text, or a falsy value
                 when no font has been selected.
    fontcache -- dict used to memoize one CMap per ToUnicode stream.

    Returns the decoded text, or the single character "?" when the font
    has neither a ToUnicode CMap nor one of the recognized encodings.
    """
    # Guard clause: with no font at all, assume the bytes are Latin-1.
    if not font:
        return string.decode("Latin-1")

    if font.ToUnicode:
        # Decompress the CMap stream first; this also checks that it is not
        # compressed in a way we cannot understand.
        from pdfrw.uncompress import uncompress as uncompress_streams
        uncompress_streams([font.ToUnicode])

        # The CMap maps character codes to Unicode code points. Parse it
        # only once per distinct stream and reuse it afterwards.
        cache_key = font.ToUnicode.stream
        if cache_key not in fontcache:
            fontcache[cache_key] = CMap(font.ToUnicode)
        return fontcache[cache_key].decode(string)

    # Simple fonts: map the PDF encoding name onto a Python codec. The
    # comparison uses == (not a dict lookup) because font.Encoding is not
    # guaranteed to be hashable.
    known_encodings = (
        ("/WinAnsiEncoding", "cp1252"),
        ("/MacRomanEncoding", "mac_roman"),
    )
    for pdf_name, codec in known_encodings:
        if font.Encoding == pdf_name:
            return string.decode(codec, "replace")

    # Nothing we know how to decode.
    return "?"
def toUnicode(string, font, fontcache):
    """Decode the raw bytes of a PDF text string into Unicode text.

    string    -- the bytes of the text operand.
    font      -- the font object active for this text, or a falsy value
                 when no font has been selected.
    fontcache -- dict used to memoize one CMap per ToUnicode stream.

    The ToUnicode CMap is tried first. If decoding through it raises
    RedactionException, or the font has no ToUnicode entry, the encoding
    reported by get_encoding(font) is used instead. Returns "?" (and logs
    a warning) when that encoding is not recognized.
    """
    # Guard clause: with no font at all, assume the bytes are Latin-1.
    if not font:
        return string.decode("Latin-1")

    if font.ToUnicode:
        # Decompress the CMap stream first; this also checks that it is not
        # compressed in a way we cannot understand.
        from pdfrw.uncompress import uncompress as uncompress_streams
        uncompress_streams([font.ToUnicode])

        # Parse the CMap once per distinct stream and reuse it afterwards.
        cache_key = font.ToUnicode.stream
        if cache_key not in fontcache:
            fontcache[cache_key] = CMap(font.ToUnicode)
        char_map = fontcache[cache_key]

        try:
            return char_map.decode(string)
        except RedactionException:
            # The CMap could not decode this string; fall through to the
            # encoding-based fallbacks below.
            pass

    # Fallbacks keyed on the font's encoding name.
    fallback_codecs = (
        ("/WinAnsiEncoding", "cp1252"),
        ("/MacRomanEncoding", "mac_roman"),
    )
    for pdf_name, codec in fallback_codecs:
        if get_encoding(font) == pdf_name:
            return string.decode(codec, "replace")

    current_app.logger.warning(
        f'Unrecognised font with encoding {font.Encoding}, may not be able to redact properly'
    )
    return "?"
def __init__(self, cmap):
    """Parse a CMap stream into character-code <-> Unicode lookup tables.

    cmap -- the stream object holding the CMap program; its ``stream``
            attribute is tokenized after decompression.

    After construction:
      self.bytes_to_unicode -- maps a character code (bytes) to its text
      self.unicode_to_bytes -- the reverse mapping
      self.defns            -- operands recorded by ``def`` operators
      self.usecmap          -- operand of a ``usecmap`` operator, or None

    Raises ValueError when the code space range is neither one nor two
    bytes wide.
    """
    self.bytes_to_unicode = {}
    self.unicode_to_bytes = {}
    self.defns = {}
    self.usecmap = None

    # Decompress the CMap stream & check that it's not compressed in a way
    # we can't understand.
    from pdfrw.uncompress import uncompress as uncompress_streams
    uncompress_streams([cmap])

    # This is based on
    # https://github.com/euske/pdfminer/blob/master/pdfminer/cmapdb.py.
    from pdfrw import PdfString, PdfArray

    in_cmap = False
    operand_stack = []
    codespacerange = []

    # Code point -> one-character string, on both Python 2 and Python 3.
    # (The conditional expression never evaluates ``unichr`` on Python 3.)
    to_char = chr if sys.version_info >= (3,) else unichr

    def code_to_int(code):
        # Interpret the string's bytes as one big-endian integer.
        code = code.to_bytes()
        if sys.version_info < (3,):
            code = (ord(c) for c in code)
        from functools import reduce
        return reduce(lambda x0, x: x0*256 + x, (b for b in code))

    def add_mapping(code, char, offset=0):
        # Is this a mapping for a one-byte or two-byte character code? The
        # width is taken from the most recently seen code space range.
        width = len(codespacerange[0].to_bytes())
        assert len(codespacerange[1].to_bytes()) == width
        if width == 1:
            # one-byte entry
            if sys.version_info < (3,):
                code = chr(code)
            else:
                code = bytes([code])
        elif width == 2:
            # two-byte entry, high byte first
            if sys.version_info < (3,):
                code = chr(code//256) + chr(code & 255)
            else:
                code = bytes([code//256, code & 255])
        else:
            raise ValueError("Invalid code space range %s?" % repr(codespacerange))

        # Some range operands take an array with one destination per source
        # code. The selected element is already the complete destination for
        # this code, so the offset must not be added a second time below.
        if isinstance(char, PdfArray):
            char = char[offset]
            offset = 0

        # The Unicode character is given usually as a hex string of one or
        # more two-byte Unicode code points.
        if isinstance(char, PdfString):
            char = char.to_bytes()
            if sys.version_info < (3,):
                char = (ord(c) for c in char)

            char = "".join(
                to_char(xh*256 + xl) for xh, xl in chunk_pairs(list(char)))

            # For a plain range, successive codes map to successive values
            # of the last code unit of the starting destination.
            if offset > 0:
                char = char[0:-1] + to_char(ord(char[-1]) + offset)
        else:
            assert offset == 0

        self.bytes_to_unicode[code] = char
        self.unicode_to_bytes[char] = code

    # Operators whose only effect here is to discard pending operands.
    stack_resetting_operators = (
        "begincodespacerange",
        "begincidrange", "beginbfrange",
        "begincidchar", "beginbfchar",
        "beginnotdefrange", "endnotdefrange",
    )

    for token in tokenize_streams([cmap.stream]):
        if token == "begincmap":
            in_cmap = True
            operand_stack[:] = []
            continue
        elif token == "endcmap":
            in_cmap = False
            continue

        # Ignore everything outside begincmap ... endcmap.
        if not in_cmap:
            continue

        if token == "def":
            name = operand_stack.pop(0)
            value = operand_stack.pop(0)
            self.defns[name] = value

        elif token == "usecmap":
            # The operand sits on the operand stack, not on self (calling
            # self.pop raised AttributeError). Take the operand immediately
            # preceding the operator.
            self.usecmap = operand_stack.pop()

        elif token in stack_resetting_operators:
            operand_stack[:] = []

        elif token == "endcodespacerange":
            codespacerange = [operand_stack.pop(0), operand_stack.pop(0)]

        elif token in ("endcidrange", "endbfrange"):
            for (code1, code2, cid_or_name1) in chunk_triples(operand_stack):
                # Skip entries whose range bounds are not strings.
                if not isinstance(code1, PdfString) or not isinstance(code2, PdfString):
                    continue
                code1 = code_to_int(code1)
                code2 = code_to_int(code2)
                for code in range(code1, code2+1):
                    add_mapping(code, cid_or_name1, code-code1)
            operand_stack[:] = []

        elif token in ("endcidchar", "endbfchar"):
            for (code, char) in chunk_pairs(operand_stack):
                if not isinstance(code, PdfString):
                    continue
                add_mapping(code_to_int(code), char)
            operand_stack[:] = []

        else:
            # Anything else is an operand for a later operator.
            operand_stack.append(token)
def build_text_layer(document):
    """Collect the text-showing tokens of every page of ``document``.

    Each page's content streams are tokenized, and every string token is
    replaced by a mutable TextToken decoded with the font active at that
    point. The caller can then edit ``TextToken.value`` and re-serialize
    the token lists to rewrite the page content.

    Returns a tuple ``(text_tokens, page_tokens)``:
      text_tokens -- the non-empty TextTokens that were operands of a
                     text-showing operator, in document order
      page_tokens -- one list per page holding all of that page's tokens
                     (an empty list for a page without Contents)
    """
    # The text-showing operators are:
    #
    #   (text) Tj      -- show a string of text
    #   (text) '       -- move to next line and show a string of text
    #   aw ac (text) " -- show a string of text with word/character spacing
    #   [ ... ] TJ     -- show text strings from the array, which are
    #                     interleaved with spacing parameters
    #
    # (These operators appear only within BT ... ET "text objects", although
    # we don't make use of that.)
    #
    # We don't understand any other content stream operators, and in
    # particular don't know how many operands each one takes, so we can never
    # be sure whether a token is an operator or an operand. We assume that
    # whenever we see one of the tokens above it is an operator.
    #
    # TJ is a little tricky because its operand is an array that precedes it.
    #
    # The encoding of the text depends on the active font, which is set by
    # the "<font> <size> Tf" operator.

    from pdfrw import PdfObject, PdfString, PdfArray
    from pdfrw.uncompress import uncompress as uncompress_streams
    from pdfrw.objects.pdfname import BasePdfName

    text_tokens = []
    fontcache = {}

    class TextToken:
        value = None
        font = None

        def __init__(self, value, font):
            self.font = font
            self.raw_original_value = value
            self.original_value = toUnicode(value, font, fontcache)
            self.value = self.original_value

        def __str__(self):
            # __str__ is used for serialization
            if self.value == self.original_value:
                # If unchanged, return the raw original value without
                # decoding/encoding.
                return PdfString.from_bytes(self.raw_original_value)
            else:
                # If the value changed, encode it from Unicode according to
                # the encoding of the font that is active at the location of
                # this token.
                return PdfString.from_bytes(
                    fromUnicode(self.value, self.font, fontcache))

        def __repr__(self):
            # __repr__ is used for debugging
            return "Token<%s>" % repr(self.value)

    def process_text(token):
        # Empty strings carry no text worth matching against.
        if token.value == "":
            return
        text_tokens.append(token)

    def find_font(page, font_name):
        # Look the font name up in the page's own resource dictionary and
        # then in those of its ancestors. Inherited resources hang off the
        # page tree's /Parent chain, not off the resource dictionary itself,
        # so walk the page nodes. A node without Resources or without a Font
        # entry is skipped instead of being subscripted.
        node = page
        visited = set()
        while node and id(node) not in visited:
            # Guard against a malformed, cyclic /Parent chain.
            visited.add(id(node))
            resources = node.Resources
            fonts = resources.Font if resources else None
            font = fonts[font_name] if fonts else None
            if font:
                return font
            node = node.Parent
        return None

    # For each page...
    page_tokens = []
    for page in document.pages:
        # Remember this page's revised token list.
        token_list = []
        page_tokens.append(token_list)

        if page.Contents is None:
            continue

        prev_token = None
        prev_prev_token = None
        current_font = None

        # The page may have one content stream or an array of content
        # streams. If an array, they are treated as if they are concatenated
        # into a single stream (per the spec).
        if isinstance(page.Contents, PdfArray):
            contents = list(page.Contents)
        else:
            contents = [page.Contents]

        # If a compression Filter is applied, attempt to un-apply it. If an
        # unrecognized filter is present, an error is raised.
        # uncompress_streams expects an array of streams.
        uncompress_streams(contents)

        def make_mutable_string_token(token):
            # Defined inside the page loop so it reads the page's
            # current_font at call time.
            if isinstance(token, PdfString):
                token = TextToken(token.to_bytes(), current_font)

                # Remember all unicode characters seen in this font so we
                # can avoid inserting characters that the PDF isn't likely
                # to have a glyph for.
                if current_font and current_font.BaseFont:
                    fontcache.setdefault(
                        current_font.BaseFont, set()).update(token.value)
            return token

        # Iterate through the tokens in the page's content streams.
        for token in tokenize_streams(content.stream for content in contents):
            # Replace any string token with our own class that holds a
            # mutable value, which is how we'll rewrite content.
            token = make_mutable_string_token(token)

            # Append the token into a new list that holds all tokens.
            token_list.append(token)

            # If the token is an operator and we're not inside an array...
            if isinstance(token, PdfObject):
                # And it's one that we recognize, process it.
                if token in ("Tj", "'", '"') and isinstance(prev_token, TextToken):
                    # Simple text operators.
                    process_text(prev_token)
                elif token == "TJ" and isinstance(prev_token, PdfArray):
                    # The text array operator.
                    for i, item in enumerate(prev_token):
                        # (item may not be a string! only the strings are
                        # text.)
                        item = make_mutable_string_token(item)
                        prev_token[i] = item
                        if isinstance(item, TextToken):
                            process_text(item)
                elif token == "Tf" and isinstance(prev_prev_token, BasePdfName):
                    # Update the current font. prev_prev_token holds the
                    # font 'name', which is resolved through the resource
                    # dictionaries of the page and its ancestors.
                    current_font = find_font(page, prev_prev_token)

            # Remember the previously seen token in case the next operator
            # is a text-showing operator -- in which case this was the
            # operand. Remember the token before that because it may be a
            # font name for the Tf operator.
            prev_prev_token = prev_token
            prev_token = token

    return (text_tokens, page_tokens)
def build_text_layer(document, options):
    """Collect the text-showing tokens of every page of ``document``.

    Each page's content streams are tokenized, and every string token is
    replaced by a mutable TextToken decoded with the font active at that
    point. ``options`` is passed through to fromUnicode when an edited
    token is serialized.

    Returns a tuple ``(text_tokens, page_tokens)``:
      text_tokens -- the non-empty TextTokens that were operands of a
                     text-showing operator, in document order
      page_tokens -- one list per page holding all of that page's tokens
                     (an empty list for a page without Contents)

    Side effect: both lists are also published as the module-level globals
    ``text_tokens`` and ``page_tokens``.
    """
    from pdfrw import PdfObject, PdfString, PdfArray
    from pdfrw.uncompress import uncompress as uncompress_streams
    from pdfrw.objects.pdfname import BasePdfName

    global text_tokens
    global page_tokens

    text_tokens = []
    fontcache = {}

    class TextToken:
        value = None
        font = None

        def __init__(self, value, font):
            self.font = font
            self.raw_original_value = value
            self.original_value = toUnicode(value, font, fontcache)
            self.value = self.original_value

        def __str__(self):
            # __str__ is used for serialization
            if self.value == self.original_value:
                # If unchanged, return the raw original value without
                # decoding/encoding.
                return PdfString.from_bytes(self.raw_original_value)
            else:
                # If the value changed, encode it from Unicode according to
                # the encoding of the font that is active at the location of
                # this token.
                return PdfString.from_bytes(
                    fromUnicode(self.value, self.font, fontcache, options))

        def __repr__(self):
            # __repr__ is used for debugging
            return "Token<%s>" % repr(self.value)

    def process_text(token):
        # Empty strings carry no text worth matching against.
        if token.value == "":
            return
        text_tokens.append(token)

    def find_font(page, font_name):
        # Look the font name up in the page's own resource dictionary and
        # then in those of its ancestors. Inherited resources hang off the
        # page tree's /Parent chain, not off the resource dictionary itself,
        # so walk the page nodes. A node without Resources or without a Font
        # entry is skipped instead of being subscripted.
        node = page
        visited = set()
        while node and id(node) not in visited:
            # Guard against a malformed, cyclic /Parent chain.
            visited.add(id(node))
            resources = node.Resources
            fonts = resources.Font if resources else None
            font = fonts[font_name] if fonts else None
            if font:
                return font
            node = node.Parent
        return None

    # For each page...
    page_tokens = []
    for page in document.pages:
        # Remember this page's revised token list.
        token_list = []
        page_tokens.append(token_list)

        if page.Contents is None:
            continue

        prev_token = None
        prev_prev_token = None
        current_font = None

        # The page may have one content stream or an array of content
        # streams. If an array, they are treated as if they are concatenated
        # into a single stream (per the spec).
        if isinstance(page.Contents, PdfArray):
            contents = list(page.Contents)
        else:
            contents = [page.Contents]

        # If a compression Filter is applied, attempt to un-apply it. If an
        # unrecognized filter is present, an error is raised.
        # uncompress_streams expects an array of streams.
        uncompress_streams(contents)

        def make_mutable_string_token(token):
            # Defined inside the page loop so it reads the page's
            # current_font at call time.
            if isinstance(token, PdfString):
                token = TextToken(token.to_bytes(), current_font)

                # Remember all unicode characters seen in this font so we
                # can avoid inserting characters that the PDF isn't likely
                # to have a glyph for.
                if current_font and current_font.BaseFont:
                    fontcache.setdefault(
                        current_font.BaseFont, set()).update(token.value)
            return token

        # Iterate through the tokens in the page's content streams.
        for token in tokenize_streams(content.stream for content in contents):
            # Replace any string token with our own class that holds a
            # mutable value, which is how we'll rewrite content.
            token = make_mutable_string_token(token)

            # Append the token into a new list that holds all tokens.
            token_list.append(token)

            # If the token is an operator and we're not inside an array...
            if isinstance(token, PdfObject):
                # And it's one that we recognize, process it.
                if token in ("Tj", "'", '"') and isinstance(prev_token, TextToken):
                    # Simple text operators.
                    process_text(prev_token)
                elif token == "TJ" and isinstance(prev_token, PdfArray):
                    # The text array operator.
                    for i, item in enumerate(prev_token):
                        # (item may not be a string! only the strings are
                        # text.)
                        item = make_mutable_string_token(item)
                        prev_token[i] = item
                        if isinstance(item, TextToken):
                            process_text(item)
                elif token == "Tf" and isinstance(prev_prev_token, BasePdfName):
                    # Update the current font. prev_prev_token holds the
                    # font 'name', which is resolved through the resource
                    # dictionaries of the page and its ancestors.
                    current_font = find_font(page, prev_prev_token)

            # Remember the previously seen token in case the next operator
            # is a text-showing operator -- in which case this was the
            # operand. Remember the token before that because it may be a
            # font name for the Tf operator.
            prev_prev_token = prev_token
            prev_token = token

    return (text_tokens, page_tokens)