def insert_page(
    self, page: Page, index: int = -1
) -> "Document":  # type: ignore [name-defined]
    """
    Insert ``page`` into the page tree of this document.

    Missing scaffolding (``XRef``, ``Trailer``, ``Root``, ``Pages``) is
    created on the fly. The page is then added to ``/Kids`` and ``/Count``
    is incremented by one.

    :param page:    the page to insert
    :param index:   position in ``/Kids`` at which the page is inserted.
                    Non-negative values are used as-is. Negative values count
                    from the end: -1 (the default) places the page after the
                    last page, -2 before the last page, and so on.
    :return:        self, so calls can be chained
    """
    # build XRef
    if "XRef" not in self:
        self["XRef"] = PlainTextXREF()
    xref = self["XRef"]

    # build Trailer
    if "Trailer" not in xref:
        xref["Trailer"] = Dictionary()
        # NOTE(review): Size is stored on the XRef itself rather than on the
        # Trailer; kept unchanged - confirm which one the writer expects
        xref[Name("Size")] = Decimal(0)
    trailer = xref["Trailer"]

    # build Root
    if "Root" not in trailer:
        trailer[Name("Root")] = Dictionary()
    root = trailer["Root"]

    # build Pages
    if "Pages" not in root:
        root[Name("Pages")] = Dictionary()
        root[Name("Pages")][Name("Count")] = Decimal(0)
        root[Name("Pages")][Name("Kids")] = List()
    pages = root["Pages"]

    # update /Kids
    kids = pages["Kids"]
    assert kids is not None
    assert isinstance(kids, List)
    if index < 0:
        # list.insert(-1, x) would place x *before* the last element, so the
        # default of -1 is translated to "one past the end" explicitly
        index = max(0, len(kids) + index + 1)
    kids.insert(index, page)

    # update /Count
    pages["Count"] = Decimal(pages["Count"] + 1)

    # return
    return self
def transform(
    self,
    object_to_transform: AnyPDFType,
    context: Optional[WriteTransformerContext] = None,
):
    """
    Write a Dictionary to ``context.destination``.

    Values that are Dictionary / List / Stream / Image instances and report
    ``can_be_referenced()`` are replaced by a reference in the written
    dictionary; the values themselves are written after the dictionary.

    :param object_to_transform: the Dictionary to write
    :param context:             the write context (destination and
                                bookkeeping of already written references)
    :return:    the dictionary as it was written (references substituted),
                or None when the object was skipped as a duplicate
    """
    assert isinstance(object_to_transform, Dictionary)
    assert context is not None
    assert context.destination is not None
    assert context.destination

    # split the entries: referencable containers become references and are
    # remembered so they can be written once this dictionary is complete
    out_value = Dictionary()
    queue: typing.List[AnyPDFType] = []
    container_types = (Dictionary, List, Stream, Image)
    for key, value in object_to_transform.items():
        if isinstance(value, container_types) and value.can_be_referenced():  # type: ignore [union-attr]
            out_value[key] = self.get_reference(value, context)
            queue.append(value)
            continue
        out_value[key] = value

    # start object if needed
    started_object = False
    ref = object_to_transform.get_reference()  # type: ignore [attr-defined]
    if ref is not None:
        assert isinstance(ref, Reference)
        assert ref.object_number is not None
        if ref in context.duplicate_references:
            logger.debug("skip writing object %d %d R (duplicate)" %
                         (ref.object_number, ref.generation_number or 0))
            return
        if ref.object_number is not None and ref.byte_offset is None:
            started_object = True
            self.start_object(object_to_transform, context)
            context.duplicate_references.append(ref)

    # write dictionary at current location: "<<key value key value>>\n"
    context.destination.write(b"<<")
    for position, (key, value) in enumerate(out_value.items()):
        if position > 0:
            # single space between consecutive key/value pairs
            context.destination.write(b" ")
        self.get_root_transformer().transform(key, context)
        context.destination.write(b" ")
        self.get_root_transformer().transform(value, context)
    context.destination.write(b">>\n")

    # end object if needed
    if started_object:
        self.end_object(object_to_transform, context)

    # write the containers that were replaced by references
    for pending in queue:
        self.get_root_transformer().transform(pending, context)

    # return
    return out_value
def test_document(self, file) -> bool:
    """
    Load ``file``, set the /Author entry of its Info dictionary and write the
    result to ``<output_dir>/<stem>_out.pdf``.

    :param file:    path of the input PDF; its ``stem`` names the output file
    :return:        False when the loaded document has no XRef or no Trailer,
                    True once the modified document has been written
    """
    # create output directory if it does not exist yet
    # (assumes self.output_dir is a pathlib.Path, it is combined with `/`
    # below - TODO confirm)
    self.output_dir.mkdir(parents=True, exist_ok=True)

    with open(file, "rb") as pdf_file_handle:
        doc = PDF.loads(pdf_file_handle)

    if "XRef" not in doc:
        return False
    if "Trailer" not in doc["XRef"]:
        return False
    trailer = doc["XRef"]["Trailer"]
    if "Info" not in trailer:
        trailer[Name("Info")] = Dictionary()

    # change author (key stored as a Name, like the other dictionary keys)
    trailer["Info"][Name("Author")] = String("Joris Schellekens")

    # determine output location
    out_file = self.output_dir / (file.stem + "_out.pdf")
    with open(out_file, "wb") as pdf_file_handle:
        PDF.dumps(pdf_file_handle, doc)

    return True
def test_document(self, file) -> bool:
    """
    Load ``file``, set the /Producer entry of its Info dictionary and write
    the result to ``<output_dir>/<stem>_out.pdf``.

    :param file:    path of the input PDF; its ``stem`` names the output file
    :return:        False when the loaded document has no XRef or no Trailer,
                    True once the modified document has been written
    """
    # create output directory if it does not exist yet
    # (assumes self.output_dir is a pathlib.Path, it is combined with `/`
    # below - TODO confirm)
    self.output_dir.mkdir(parents=True, exist_ok=True)

    # the input only needs to be opened once
    with open(file, "rb") as pdf_file_handle:
        doc = PDF.loads(pdf_file_handle)

    if "XRef" not in doc:
        return False
    if "Trailer" not in doc["XRef"]:
        return False
    trailer = doc["XRef"]["Trailer"]
    if "Info" not in trailer:
        trailer[Name("Info")] = Dictionary()

    # change producer (key stored as a Name, like the other dictionary keys)
    trailer["Info"][Name("Producer")] = String("pText")

    # determine output location
    out_file = self.output_dir / (file.stem + "_out.pdf")
    with open(out_file, "wb") as pdf_file_handle:
        # hand the opened handle to dumps, not the path
        PDF.dumps(pdf_file_handle, doc)

    return True
def _read_trailer(self, src: io.IOBase, tok: HighLevelTokenizer) -> Dictionary:
    """
    Read the trailer dictionary at the tokenizer's current position.

    :param src: the underlying byte source (rewound by two bytes so the
                dictionary opener can be re-read)
    :param tok: the tokenizer reading from ``src``
    :return:    the trailer Dictionary, or an empty Dictionary when the next
                token is not the keyword ``trailer``
    :raises PDFSyntaxError: when ``trailer`` is not followed by a dictionary,
                            or the dictionary is not followed by ``startxref``
    """
    # no "trailer" keyword: nothing to read
    keyword = tok.next_non_comment_token()
    assert keyword is not None
    if keyword.text != "trailer":
        return Dictionary()

    # the keyword must be followed by the start of a dictionary
    opener = tok.next_non_comment_token()
    assert opener is not None
    if opener.token_type != TokenType.START_DICT:
        raise PDFSyntaxError(
            byte_offset=tok.tell(),
            message="invalid XREF trailer",
        )

    # step back over "<<" so read_dictionary sees the opener itself
    src.seek(-2, io.SEEK_CUR)
    trailer_dict = tok.read_dictionary()

    # the dictionary must be followed by "startxref"
    closing = tok.next_non_comment_token()
    assert closing is not None
    is_startxref = (closing.token_type == TokenType.OTHER
                    and closing.text == "startxref")
    if not is_startxref:
        raise PDFSyntaxError(
            byte_offset=closing.byte_offset,
            message="start of XREF not found",
        )

    return trailer_dict
def transform(
    self,
    object_to_transform: AnyPDFType,
    context: Optional[WriteTransformerContext] = None,
):
    """
    Write a Stream (dictionary part, ``stream`` / ``endstream`` keywords and
    the raw "Bytes" payload) to ``context.destination``.

    The "Bytes" and "DecodedBytes" entries are left out of the written
    dictionary. Dictionary / List / Stream values that report
    ``can_be_referenced()`` are replaced by references and written after
    the stream itself.

    :param object_to_transform: the Stream to write
    :param context:             the write context
    :return:                    None
    """
    assert context is not None
    assert context.destination is not None
    assert isinstance(object_to_transform, Stream)

    # start object if needed
    started_object = False
    ref = object_to_transform.get_reference()  # type: ignore [attr-defined]
    if ref is not None:
        assert isinstance(ref, Reference)
        assert ref.object_number is not None
        if ref in context.duplicate_references:
            logger.debug("skip writing object %d %d R (duplicate)" %
                         (ref.object_number, ref.generation_number or 0))
            return
        if ref.object_number is not None and ref.byte_offset is None:
            started_object = True
            self.start_object(object_to_transform, context)
            context.duplicate_references.append(ref)

    # build the dictionary part, deferring referencable containers
    stream_dictionary = Dictionary()
    queue: typing.List[AnyPDFType] = []
    payload_keys = ("Bytes", "DecodedBytes")
    container_types = (Dictionary, List, Stream)
    for key, value in object_to_transform.items():
        if key in payload_keys:
            continue
        if isinstance(value, container_types) and value.can_be_referenced():  # type: ignore [union-attr]
            stream_dictionary[key] = self.get_reference(value, context)
            queue.append(value)
        else:
            stream_dictionary[key] = value

    # write stream dictionary
    self.get_root_transformer().transform(stream_dictionary, context)

    # write "stream", the payload, and "endstream"
    context.destination.write(b"stream\n")
    context.destination.write(object_to_transform["Bytes"])
    context.destination.write(b"\nendstream\n")

    # end object if needed
    if started_object:
        self.end_object(object_to_transform, context)

    # write the containers that were replaced by references
    for pending in queue:
        self.get_root_transformer().transform(pending, context)
def test_hash_types(self):
    """
    Build a Dictionary and a List holding PDF primitives and print the hash
    of the List.
    """
    # a dictionary with a reference and a boolean entry
    sample_dict = Dictionary()
    sample_dict[Name("Root")] = Reference(object_number=10)
    sample_dict[Name("Marked")] = Boolean(True)

    # a list with a name and a number
    sample_list = List()
    sample_list.append(Name("Red"))
    sample_list.append(Decimal(0.5))

    list_hash = hash(sample_list)
    print(list_hash)
def transform(
    self,
    object_to_transform: Any,
    context: Optional[WriteTransformerContext] = None,
):
    """
    Write the file header, refresh the trailer metadata (Info, ID,
    CreationDate, ModDate, Producer) and hand the XRef to the root
    transformer.

    :param object_to_transform: the document; must expose ["XRef"]["Trailer"]
    :param context:             the write context
    :return:                    None
    """
    assert context is not None
    assert context.destination is not None

    # write header: version line, then a comment line made of four bytes
    # with values above 127
    header_chunks = (
        b"%PDF-1.7\n",
        b"%",
        bytes([226, 227, 207, 211]),
        b"\n",
    )
    for chunk in header_chunks:
        context.destination.write(chunk)

    # invalidate all references
    WritePDFTransformer._invalidate_all_references(object_to_transform)

    trailer = object_to_transform["XRef"]["Trailer"]

    # create Info dictionary if needed
    if "Info" not in trailer:
        trailer[Name("Info")] = Dictionary()

    # set /ID: a fresh pair when absent, otherwise only the second element
    random_id = HexadecimalString("%032x" % random.randrange(16**32))
    if "ID" in trailer:
        trailer["ID"][1] = random_id
    else:
        trailer[Name("ID")] = List().set_can_be_referenced(False)  # type: ignore [attr-defined]
        trailer["ID"].append(random_id)
        trailer["ID"].append(random_id)

    # one timestamp is shared by CreationDate (only if absent) and ModDate
    modification_date = WritePDFTransformer._timestamp_to_str()
    info = trailer[Name("Info")]
    if "CreationDate" not in info:
        info[Name("CreationDate")] = String(modification_date)
    info[Name("ModDate")] = String(modification_date)

    # set Producer
    info[Name("Producer")] = String("pText")

    # transform XREF
    self.get_root_transformer().transform(object_to_transform["XRef"], context)
def read_dictionary(self) -> Dictionary:
    """
    Consume the upcoming tokens and assemble them into a Dictionary.

    :return:    the Dictionary that was read
    :raises PDFEOFError:    when the tokens run out before the dictionary is
                            closed
    :raises PDFSyntaxError: when the first token does not open a dictionary,
                            a key is not a name, or a key has no value
    """
    opener = self.next_non_comment_token()
    if opener is None:
        raise PDFEOFError()
    if opener.token_type != TokenType.START_DICT:
        raise PDFSyntaxError(message="invalid dictionary",
                             byte_offset=opener.byte_offset)

    out_dict = Dictionary()
    while True:
        # either the end of the dictionary, or the next key
        key_token = self.next_non_comment_token()
        if key_token is None:
            raise PDFEOFError()
        if key_token.token_type == TokenType.END_DICT:
            return out_dict
        if key_token.token_type != TokenType.NAME:
            raise PDFSyntaxError(
                message="dictionary key must be a name",
                byte_offset=key_token.byte_offset,
            )

        # the key is the token text without its first character
        name = Name(key_token.text[1:])

        # every key must be followed by a value
        value = self.read_object()
        if value is None:
            raise PDFSyntaxError(
                message="unexpected end of dictionary",
                byte_offset=key_token.byte_offset,
            )

        out_dict[name] = value
def read(
    self,
    io_source: Union[io.BufferedIOBase, io.RawIOBase],
    tokenizer: HighLevelTokenizer,
    initial_offset: Optional[int] = None,
) -> "XREF":
    """
    Read a cross-reference stream and load its entries and trailer into
    this XREF.

    :param io_source:       the byte source of the PDF
    :param tokenizer:       tokenizer reading from ``io_source``
    :param initial_offset:  byte offset of the cross-reference stream; when
                            None the position is located with
                            ``_seek_to_xref_token``
    :return:                self
    :raises AssertionError: when the object read is not a Stream, or its
                            W / Size / Index entries are missing or malformed
    """
    if initial_offset is not None:
        io_source.seek(initial_offset)
    else:
        self._seek_to_xref_token(io_source, tokenizer)

    xref_stream = tokenizer.read_object()
    assert isinstance(xref_stream, Stream)

    # check and decode widths (byte width of each of the three entry fields)
    assert "W" in xref_stream
    assert all(isinstance(w, Decimal) for w in xref_stream["W"])
    widths = [int(w) for w in xref_stream["W"]]

    # parent
    document = self.get_root()  # type: ignore [attr-defined]

    # list of references, seeded with the entry for object 0
    indirect_references = [
        Reference(
            object_number=0,
            generation_number=65535,
            is_in_use=False,
            document=document,
        )
    ]
    # first reference seen for every object number, so the lookup below does
    # not have to scan the whole list for each entry
    first_ref_by_number = {0: indirect_references[0]}

    # check and get size
    assert "Size" in xref_stream
    assert isinstance(xref_stream["Size"], Decimal)
    number_of_objects = int(xref_stream["Size"])

    # index: pairs of (first object number, number of entries)
    if "Index" in xref_stream:
        index = xref_stream["Index"]
        assert isinstance(index, List)
        assert len(index) % 2 == 0
        assert isinstance(index[0], Decimal)
        assert isinstance(index[1], Decimal)
    else:
        index = [Decimal(0), Decimal(number_of_objects)]

    # apply filters
    xref_stream = decode_stream(xref_stream)
    xref_stream_decoded_bytes = xref_stream["DecodedBytes"]

    # The entries of all subsections are stored back to back in the stream
    # (ISO 32000-1, 7.5.8), so the read position must carry over from one
    # subsection to the next instead of restarting at 0.
    bptr = 0

    def read_field(width: int) -> int:
        """Read ``width`` bytes at the current position as a big-endian integer."""
        nonlocal bptr
        value = 0
        for _ in range(width):
            value = (value << 8) + (xref_stream_decoded_bytes[bptr] & 0xFF)
            bptr += 1
        return value

    # read every range specified in /Index
    for idx in range(0, len(index), 2):
        start = int(index[idx])
        length = int(index[idx + 1])

        for i in range(length):
            object_number = start + i

            # the type field defaults to 1 when its width is 0
            entry_type = read_field(widths[0]) if widths[0] > 0 else 1
            field2 = read_field(widths[1])
            field3 = read_field(widths[2])

            # check type
            assert entry_type in [0, 1, 2]

            if entry_type == 0:
                # free object:
                # field2 = object number of the next free object
                # field3 = generation number to use if this number is reused
                pdf_indirect_reference = Reference(
                    document=document,
                    object_number=object_number,
                    byte_offset=field2,
                    generation_number=field3,
                    is_in_use=False,
                )
            elif entry_type == 1:
                # in use, not compressed:
                # field2 = byte offset of the object from the start of the file
                # field3 = generation number of the object
                pdf_indirect_reference = Reference(
                    document=document,
                    object_number=object_number,
                    byte_offset=field2,
                    generation_number=field3,
                )
            else:
                # compressed object:
                # field2 = object number of the containing object stream
                #          (its generation number is implicitly 0)
                # field3 = index of this object within the object stream
                pdf_indirect_reference = Reference(
                    document=document,
                    object_number=object_number,
                    generation_number=0,
                    parent_stream_object_number=field2,
                    index_in_parent_stream=field3,
                )

            # decide whether to append the entry or merge it into an
            # existing reference with the same object number
            existing_indirect_ref = first_ref_by_number.get(object_number)
            ref_is_in_reading_state = (
                existing_indirect_ref is not None
                and existing_indirect_ref.is_in_use
                and existing_indirect_ref.generation_number
                == pdf_indirect_reference.generation_number)
            ref_is_first_encountered = existing_indirect_ref is None or (
                not ref_is_in_reading_state
                and existing_indirect_ref.document is None)

            if ref_is_first_encountered:
                indirect_references.append(pdf_indirect_reference)
                first_ref_by_number.setdefault(object_number,
                                               pdf_indirect_reference)
            elif ref_is_in_reading_state:
                assert existing_indirect_ref is not None
                existing_indirect_ref.index_in_parent_stream = (
                    pdf_indirect_reference.index_in_parent_stream)
                existing_indirect_ref.parent_stream_object_number = (
                    pdf_indirect_reference.parent_stream_object_number)

    # add section
    for r in indirect_references:
        self.append(r)

    # initialize trailer
    self["Trailer"] = Dictionary(xref_stream)

    # return
    return self
def _create_annotation(
    self,
    rectangle: Tuple[Decimal, Decimal, Decimal, Decimal],
    contents: Optional[str] = None,
    color: Optional[Color] = None,
    border_horizontal_corner_radius: Optional[Decimal] = None,
    border_vertical_corner_radius: Optional[Decimal] = None,
    border_width: Optional[Decimal] = None,
):
    """
    Build the entries shared by all annotation dictionaries.

    :param rectangle:   the four numbers stored in /Rect
    :param contents:    optional text stored in /Contents
    :param color:       optional color; its RGB components (each divided by
                        256) are stored in /C
    :param border_horizontal_corner_radius: first element of /Border
    :param border_vertical_corner_radius:   second element of /Border
    :param border_width:                    third element of /Border
    :return:    the annotation Dictionary (not yet added to the page)

    /Border is only written when all three border arguments are given.
    """
    annot = Dictionary()

    # /Type (optional): Annot for an annotation dictionary
    annot[Name("Type")] = Name("Annot")

    # /Rect (required): location of the annotation on the page, in default
    # user space units
    annot[Name("Rect")] = List().set_can_be_referenced(False)  # type: ignore [attr-defined]
    for position in range(4):
        annot["Rect"].append(pDecimal(rectangle[position]))

    # /Contents (optional): text displayed for the annotation, or an
    # alternate human-readable description (see 14.9.3 and 12.5.6)
    if contents is not None:
        annot[Name("Contents")] = String(contents)

    # /P (optional except as noted in 12.5.6.18; PDF 1.3): the page object
    # this annotation is associated with
    annot[Name("P")] = self

    # /NM (optional; PDF 1.4): annotation name, numbered by how many
    # annotations the page already has
    existing_count = len(self["Annots"]) if "Annots" in self else 0
    annot[Name("NM")] = String("annotation-{0:03d}".format(existing_count))

    # /M (optional; PDF 1.1): date and time of the most recent modification
    # (see 7.9.4)
    annot[Name("M")] = String(self._timestamp_to_str())

    # /F (optional; PDF 1.1): annotation flags (see 12.5.3)
    annot[Name("F")] = pDecimal(4)

    # /AP and /AS (appearance dictionary and appearance state, see 12.5.5)
    # are not set here

    # /Border (optional): horizontal corner radius, vertical corner radius
    # and border width, in default user space units. Radii of 0 give square
    # corners; a width of 0 draws no border. The optional fourth element
    # (dash array, PDF 1.1, see 8.4.3.6) is not written.
    border_values = (
        border_horizontal_corner_radius,
        border_vertical_corner_radius,
        border_width,
    )
    if all(value is not None for value in border_values):
        annot[Name("Border")] = List().set_can_be_referenced(False)  # type: ignore [attr-defined]
        for value in border_values:
            annot["Border"].append(pDecimal(value))

    # /C (optional; PDF 1.1): numbers in the range 0.0 to 1.0 giving the
    # colour of the closed icon background, the pop-up title bar or the link
    # border; the number of elements selects the colour space
    if color is not None:
        color_max = pDecimal(256)
        rgb = color.to_rgb()
        annot[Name("C")] = List().set_can_be_referenced(False)  # type: ignore [attr-defined]
        for component in (rgb.red, rgb.green, rgb.blue):
            annot["C"].append(pDecimal(component / color_max))

    # /StructParent (14.7.4.4) and /OC (8.11) are not set here

    # return
    return annot
def read(self, io_source: io.IOBase) -> "Canvas":
    """
    Tokenize a content stream and execute it operator by operator.

    Operands are collected on a stack until an operator name is read. The
    operator is looked up in ``self.canvas_operators``, recorded in
    ``self["Instructions"]`` and invoked with its operands.

    :param io_source:   the (decoded) content stream
    :return:            self
    :raises IllegalGraphicsStateError:  when an operator has fewer operands
                                        available than it needs, outside a
                                        compatibility section
    """
    # determine the total length, then rewind
    io_source.seek(0, os.SEEK_END)
    length = io_source.tell()
    io_source.seek(0)

    canvas_tokenizer = HighLevelTokenizer(io_source)

    # process content
    operand_stk = []
    while canvas_tokenizer.tell() != length:

        # attempt to read object
        obj = canvas_tokenizer.read_object()
        if obj is None:
            break

        # anything that is not an operator name is an operand
        if not isinstance(obj, CanvasOperatorName):
            operand_stk.append(obj)
            continue

        # look up the operator; anything but exactly one match is ignored
        matches = [
            x for x in self.canvas_operators if x.get_text() == str(obj)
        ]
        if len(matches) != 1:
            continue
        operator = matches[0]

        expected = operator.get_number_of_operands()
        if len(operand_stk) < expected:
            # if we are in a compatibility section ignore any possible mistake
            if self.in_compatibility_section:
                continue
            raise IllegalGraphicsStateError(
                message=
                "Unable to execute operator %s. Expected %d arguments, received %d."
                % (
                    operator.text,
                    expected,
                    len(operand_stk),
                ))

        # take the last `expected` operands off the stack, keeping their order
        cut = len(operand_stk) - expected
        operands: typing.List["CanvasOperator"] = operand_stk[cut:]  # type: ignore [name-defined]
        del operand_stk[cut:]

        # record the instruction
        if "Instructions" not in self:
            self["Instructions"] = List().set_parent(self)  # type: ignore [attr-defined]
        instruction_number = len(self["Instructions"])
        instruction_dictionary = Dictionary()
        instruction_dictionary["Name"] = operator.get_text()
        instruction_dictionary["Args"] = List().set_parent(  # type: ignore [attr-defined]
            instruction_dictionary)
        for operand in operands:
            instruction_dictionary["Args"].append(operand)
        self["Instructions"].append(instruction_dictionary)

        # debug
        logger.debug("%d %s %s" % (
            instruction_number,
            operator.text,
            str([str(x) for x in operands]),
        ))

        # invoke; failures are only tolerated inside a compatibility section
        try:
            operator.invoke(self, operands)
        except Exception as e:
            if not self.in_compatibility_section:
                raise e

    # return
    return self
def transform(
    self,
    object_to_transform: AnyPDFType,
    context: Optional[WriteTransformerContext] = None,
):
    """
    Write the objects reachable from the trailer (Root, Info, ID), followed
    by the plain-text cross-reference table, the trailer dictionary,
    ``startxref`` and the end-of-file marker.

    :param object_to_transform: the XREF to write; must contain "Trailer"
    :param context:             the write context
    :return:                    None
    """
    assert isinstance(object_to_transform, XREF)
    assert "Trailer" in object_to_transform
    assert isinstance(object_to_transform["Trailer"], Dictionary)
    assert context is not None
    assert context.destination is not None

    trailer = object_to_transform["Trailer"]

    # transform Trailer dictionary (replacing objects by references)
    trailer_out = Dictionary()

    # /Root
    trailer_out[Name("Root")] = self.get_reference(trailer["Root"], context)

    # /Info
    if "Info" in trailer:
        trailer_out[Name("Info")] = self.get_reference(trailer["Info"],
                                                       context)

    # /Size (provisional, recomputed once all objects have been written)
    if "Size" in trailer:
        trailer_out[Name("Size")] = trailer["Size"]
    else:
        trailer_out[Name("Size")] = Decimal(0)

    # /ID
    if "ID" in trailer:
        trailer_out[Name("ID")] = self.get_reference(trailer["ID"], context)

    # write Root, Info and ID objects
    self.get_root_transformer().transform(trailer["Root"], context)
    if "Info" in trailer:
        self.get_root_transformer().transform(trailer["Info"], context)
    if "ID" in trailer:
        self.get_root_transformer().transform(trailer["ID"], context)

    # write XREF
    start_of_xref = context.destination.tell()
    context.destination.write(bytes("xref\n", "latin1"))
    for section in self._section_xref(context):
        # subsection header: first object number and number of entries
        context.destination.write(
            bytes("%d %d\n" % (section[0].object_number, len(section)),
                  "latin1"))
        for r in section:
            # Every entry must be exactly 20 bytes including its end-of-line
            # marker (ISO 32000-1, 7.5.4): 10-digit offset, space, 5-digit
            # generation, space, keyword, CR LF.
            # NOTE(review): the generation number is always written as 00000
            keyword = "n" if r.is_in_use else "f"
            context.destination.write(
                bytes("{0:010d} 00000 {1}\r\n".format(r.byte_offset, keyword),
                      "latin1"))

    # update Size
    trailer_out[Name("Size")] = Decimal(
        sum(len(v) for v in context.indirect_objects.values()))

    # write Trailer
    context.destination.write(bytes("trailer\n", "latin1"))
    self.get_root_transformer().transform(trailer_out, context)
    context.destination.write(bytes("startxref\n", "latin1"))

    # write byte offset of last cross-reference section
    context.destination.write(bytes(str(start_of_xref) + "\n", "latin1"))

    # write EOF
    context.destination.write(bytes("%%EOF", "latin1"))
def read(self, io_source: io.IOBase) -> "Canvas":
    """
    Tokenize a content stream and execute it operator by operator.

    Operands are collected on a stack until an operator name is read. The
    operator is looked up in ``self.canvas_operators``, recorded in
    ``self["Instructions"]`` and invoked with its operands. Operator names
    without a registered operator are logged and skipped.

    Inside a compatibility section an operator that has too few operands
    available is skipped, and exceptions raised by an operator are
    swallowed.

    :param io_source:   the (decoded) content stream
    :return:            self
    :raises AssertionError: when an operator has fewer operands available
                            than it needs, outside a compatibility section
    """
    # determine the total length, then rewind
    io_source.seek(0, os.SEEK_END)
    length = io_source.tell()
    io_source.seek(0)

    canvas_tokenizer = HighLevelTokenizer(io_source)

    # process content
    operand_stk = []
    while canvas_tokenizer.tell() != length:

        # attempt to read object
        obj = canvas_tokenizer.read_object()
        if obj is None:
            break

        # push argument onto stack
        if not isinstance(obj, CanvasOperatorName):
            operand_stk.append(obj)
            continue

        # process operator
        operator = self.canvas_operators.get(obj, None)
        if operator is None:
            logger.debug("Missing operator %s" % obj)
            continue

        expected = operator.get_number_of_operands()
        if len(operand_stk) < expected:
            # in a compatibility section the operator is skipped rather than
            # popping from a stack that does not hold enough operands
            if self.in_compatibility_section:
                continue
            # raised explicitly so the check is not stripped under -O
            raise AssertionError(
                "Unable to execute operator %s. Expected %d arguments, received %d."
                % (operator.get_text(), expected, len(operand_stk)))

        operands: typing.List["CanvasOperator"] = []  # type: ignore [name-defined]
        for _ in range(expected):
            operands.insert(0, operand_stk.pop(-1))

        # append
        if "Instructions" not in self:
            self["Instructions"] = List().set_parent(self)  # type: ignore [attr-defined]
        instruction_number = len(self["Instructions"])
        instruction_dictionary = Dictionary()
        instruction_dictionary["Name"] = operator.get_text()
        instruction_dictionary["Args"] = List().set_parent(  # type: ignore [attr-defined]
            instruction_dictionary)
        for operand in operands:
            instruction_dictionary["Args"].append(operand)
        self["Instructions"].append(instruction_dictionary)

        # debug
        logger.debug("%d %s %s" % (
            instruction_number,
            operator.text,
            str([str(x) for x in operands]),
        ))

        # invoke; failures are only tolerated inside a compatibility section
        try:
            operator.invoke(self, operands)
        except Exception:
            if not self.in_compatibility_section:
                raise

    # return
    return self
def decode_stream(s: Stream) -> Stream:
    """
    Apply the filters listed in ``/Filter`` to ``s["Bytes"]`` and store the
    result in ``s["DecodedBytes"]``.

    The stream is modified in place: besides "DecodedBytes", a "Type" entry
    of ``Stream`` is added when the stream has no "Type" yet.

    :param s:   the stream to decode; must contain "Bytes"
    :return:    the same stream object
    :raises PDFValueError:  when a filter other than FlateDecode / Fl,
                            ASCII85Decode, LZWDecode or RunLengthDecode is
                            requested
    """
    assert isinstance(s, Stream)
    assert "Bytes" in s

    # determine filter(s) to apply
    filters: typing.List[str] = []
    if "Filter" in s:
        if isinstance(s["Filter"], List):
            filters = s["Filter"]
        else:
            filters = [s["Filter"]]

    # determine the decode parameters given for the filter(s)
    decode_params: typing.List[Dictionary] = []
    if "DecodeParms" in s:
        if isinstance(s["DecodeParms"], List):
            decode_params = s["DecodeParms"]
        else:
            assert s["DecodeParms"] is not None
            assert isinstance(s["DecodeParms"], Dictionary)
            decode_params = [s["DecodeParms"]]

    def params_for(position: int) -> Dictionary:
        """Return the parameters of the filter at ``position`` (empty when none were given)."""
        # /DecodeParms may be absent, shorter than /Filter, or hold a null
        # entry for a filter without parameters (ISO 32000-1, Table 5)
        # NOTE(review): presumably a null entry is parsed as None - confirm
        if position >= len(decode_params) or decode_params[position] is None:
            return Dictionary()
        return decode_params[position]

    # apply filter(s)
    transformed_bytes = s["Bytes"]
    for filter_index, filter_name in enumerate(filters):
        if filter_name in ["FlateDecode", "Fl"]:
            params = params_for(filter_index)
            transformed_bytes = FlateDecode.decode(
                bytes_in=transformed_bytes,
                columns=int(params.get("Columns", Decimal(1))),
                predictor=int(params.get("Predictor", Decimal(1))),
                bits_per_component=int(
                    params.get("BitsPerComponent", Decimal(8))),
            )
        elif filter_name in ["ASCII85Decode"]:
            transformed_bytes = ASCII85Decode.decode(transformed_bytes)
        elif filter_name in ["LZWDecode"]:
            transformed_bytes = LZWDecode.decode(transformed_bytes)
        elif filter_name in ["RunLengthDecode"]:
            transformed_bytes = RunLengthDecode.decode(transformed_bytes)
        else:
            # unknown filter
            raise PDFValueError(
                expected_value_description=
                "[/ASCII85Decode, /FlateDecode, /Fl, /LZWDecode, /RunLengthDecode]",
                received_value_description=str(filter_name),
            )

    # set DecodedBytes
    s[Name("DecodedBytes")] = transformed_bytes

    # set Type if not yet set
    if "Type" not in s:
        s[Name("Type")] = Name("Stream")

    # return
    return s