def append_highlight_annotation(
    self,
    rectangle: Rectangle,
    color: Color = X11Color("Yellow"),
    contents: Optional[str] = None,
) -> "Page":
    """
    Attach a Highlight annotation covering `rectangle` to this page.

    The generic annotation entries are produced by `_create_annotation`;
    this method then adds the Subtype, QuadPoints, Border and CA entries and
    appends the finished dictionary to the page's Annots array, creating
    that array when the page does not have one yet.

    :param rectangle:   area of the page to highlight
    :param color:       highlight color, forwarded to `_create_annotation`
    :param contents:    optional text, forwarded to `_create_annotation`
    :return:            this page, so calls can be chained
    """
    # start from the entries shared by every annotation
    highlight = self._create_annotation(
        rectangle=rectangle, color=color, contents=contents
    )

    # (Required) The type of annotation that this dictionary describes; shall
    # be Highlight, Underline, Squiggly, or StrikeOut for a highlight,
    # underline, squiggly-underline, or strikeout annotation, respectively.
    highlight[Name("Subtype")] = Name("Highlight")

    # (Required) An array of 8 x n numbers specifying the coordinates of n
    # quadrilaterals in default user space. Each quadrilateral shall
    # encompass a word or group of contiguous words in the text underlying
    # the annotation. The specification lists the coordinates of each
    # quadrilateral as x1 y1 x2 y2 x3 y3 x4 y4.
    highlight[Name("QuadPoints")] = List().set_can_be_referenced(
        False
    )  # type: ignore [attr-defined]
    x_origin = rectangle.get_x()
    y_origin = rectangle.get_y()
    x_far = rectangle.get_x() + rectangle.get_width()
    y_far = rectangle.get_y() + rectangle.get_height()
    # NOTE: the corners are written in the order (x1, y1), (x4, y4),
    # (x2, y2), (x3, y3) rather than in plain numeric order.
    corner_sequence = (
        (x_origin, y_origin),  # x1, y1
        (x_origin, y_far),  # x4, y4
        (x_far, y_origin),  # x2, y2
        (x_far, y_far),  # x3, y3
    )
    for corner_x, corner_y in corner_sequence:
        highlight["QuadPoints"].append(pDecimal(corner_x))
        highlight["QuadPoints"].append(pDecimal(corner_y))

    # border: horizontal corner radius 0, vertical corner radius 0, width 1
    highlight[Name("Border")] = List().set_can_be_referenced(
        False
    )  # type: ignore [attr-defined]
    for border_entry in (0, 0, 1):
        highlight["Border"].append(pDecimal(border_entry))

    # CA
    highlight[Name("CA")] = pDecimal(1)

    # register the annotation in the page's /Annots array
    if "Annots" not in self:
        self[Name("Annots")] = List()
    assert isinstance(self["Annots"], List)
    self["Annots"].append(highlight)

    return self
def test_extract_text_with_regex(self):
    """
    Find a "due date" phrase on page 0 with a regular expression, outline the
    matched area in red in a copy of the document, then read the input again
    and print the text found inside a slightly enlarged version of that area.
    """
    date_matcher = RegularExpressionTextExtraction(
        "[dD]ue [dD]ate [0-9]+/[0-9]+/[0-9]+"
    )
    input_path: Path = Path("/home/joris/Code/pdf-corpus/0600.pdf")
    with open(input_path, "rb") as reader:
        document = PDF.loads(reader, [date_matcher])

    # mark the matched area and store the annotated copy
    match_box: typing.Optional[Rectangle] = None
    marked_path = self.output_dir / (input_path.stem + "_due_date_marked.pdf")
    with open(marked_path, "wb") as writer:
        match_rects: typing.List[Rectangle] = []
        for event in date_matcher.get_matched_chunk_of_text_render_events_per_page(0):
            match_rects.append(event.get_bounding_box())
        match_box = self.bounding_box(match_rects)
        first_page = document.get_page(0)
        first_page.append_polygon_annotation(
            LineArtFactory.rectangle(match_box),
            stroke_color=X11Color("Red"),
        )
        PDF.dumps(writer, document)

    # grow the box by a small margin on every side
    # NOTE(review): only the enlargement is guarded; the filter below still
    # reads the box unconditionally.
    if match_box:
        margin = Decimal(2)
        match_box = Rectangle(
            match_box.get_x() - margin,
            match_box.get_y() - margin,
            match_box.get_width() + 2 * margin,
            match_box.get_height() + 2 * margin,
        )

    # extract only the text that falls inside the (enlarged) box
    text_collector = SimpleTextExtraction()
    area_filter = LocationFilter(
        match_box.get_x(),
        match_box.get_y(),
        match_box.get_x() + match_box.get_width(),
        match_box.get_y() + match_box.get_height(),
    ).add_listener(text_collector)
    with open(input_path, "rb") as reader:
        document = PDF.loads(reader, [area_filter])

    print(text_collector.get_text(0))
def test_extract_text_in_area(self):
    """
    Outline a fixed rectangle on page 0 in red, store the annotated copy,
    then read the input again and print the text found inside that rectangle.
    """
    area = Rectangle(Decimal(50), Decimal(400), Decimal(200), Decimal(100))
    source_path: Path = Path("/home/joris/Code/pdf-corpus/0600.pdf")
    with open(source_path, "rb") as reader:
        document = PDF.loads(reader)

    # mark the area and store the annotated copy
    marked_path = self.output_dir / (source_path.stem + "_bill_to_marked.pdf")
    with open(marked_path, "wb") as writer:
        first_page = document.get_page(0)
        first_page.append_polygon_annotation(
            LineArtFactory.rectangle(area),
            stroke_color=X11Color("Red"),
        )
        PDF.dumps(writer, document)

    # extract only the text that falls inside the area
    text_collector = SimpleTextExtraction()
    area_filter = LocationFilter(
        area.get_x(),
        area.get_y(),
        area.get_x() + area.get_width(),
        area.get_y() + area.get_height(),
    ).add_listener(text_collector)
    with open(source_path, "rb") as reader:
        document = PDF.loads(reader, [area_filter])

    print(text_collector.get_text(0))
def _create_annotation(
    self,
    rectangle: Rectangle,
    contents: Optional[str] = None,
    color: Optional[Color] = None,
    border_horizontal_corner_radius: Optional[Decimal] = None,
    border_vertical_corner_radius: Optional[Decimal] = None,
    border_width: Optional[Decimal] = None,
) -> "Dictionary":
    """
    Build the annotation dictionary entries shared by all annotation types.

    The returned dictionary is NOT added to the page's Annots array; the
    caller is expected to set the Subtype (and any type-specific entries)
    and append it.

    :param rectangle:   location of the annotation on the page (Rect entry)
    :param contents:    optional text for the Contents entry
    :param color:       optional color for the C entry, written as RGB
    :param border_horizontal_corner_radius: horizontal corner radius of the border
    :param border_vertical_corner_radius:   vertical corner radius of the border
    :param border_width:                    width of the border
    :return:            the new annotation dictionary

    The Border entry is only written when all three border parameters are
    given; a partial specification is ignored.
    """
    annot = Dictionary()

    # (Optional) The type of PDF object that this dictionary describes; if
    # present, shall be Annot for an annotation dictionary.
    annot[Name("Type")] = Name("Annot")

    # (Required) The annotation rectangle, defining the location of the
    # annotation on the page in default user space units.
    # Written as [x, y, x + width, y + height].
    annot[Name("Rect")] = List().set_can_be_referenced(
        False
    )  # type: ignore [attr-defined]
    annot["Rect"].append(pDecimal(rectangle.get_x()))
    annot["Rect"].append(pDecimal(rectangle.get_y()))
    annot["Rect"].append(pDecimal(rectangle.get_x() + rectangle.get_width()))
    annot["Rect"].append(pDecimal(rectangle.get_y() + rectangle.get_height()))

    # (Optional) Text that shall be displayed for the annotation or, if this type of
    # annotation does not display text, an alternate description of the
    # annotation's contents in human-readable form. In either case, this text is
    # useful when extracting the document's contents in support of
    # accessibility to users with disabilities or for other purposes (see 14.9.3,
    # "Alternate Descriptions"). See 12.5.6, "Annotation Types" for more
    # details on the meaning of this entry for each annotation type.
    if contents is not None:
        annot[Name("Contents")] = String(contents)

    # (Optional except as noted below; PDF 1.3; not used in FDF files) An
    # indirect reference to the page object with which this annotation is
    # associated.
    # This entry shall be present in screen annotations associated with
    # rendition actions (PDF 1.5; see 12.5.6.18, "Screen Annotations" and
    # 12.6.4.13, "Rendition Actions").
    annot[Name("P")] = self

    # (Optional; PDF 1.4) The annotation name, a text string uniquely
    # identifying it among all the annotations on its page.
    # The name is derived from the number of annotations already on the page.
    len_annots = len(self["Annots"]) if "Annots" in self else 0
    annot[Name("NM")] = String("annotation-{0:03d}".format(len_annots))

    # (Optional; PDF 1.1) The date and time when the annotation was most
    # recently modified. The format should be a date string as described in
    # 7.9.4, "Dates," but conforming readers shall accept and display a string
    # in any format.
    annot[Name("M")] = String(self._timestamp_to_str())

    # (Optional; PDF 1.1) A set of flags specifying various characteristics of
    # the annotation (see 12.5.3, "Annotation Flags"). Default value: 0.
    # NOTE(review): 4 sets bit position 3, which the specification's flag
    # table names Print - confirm this is the intended flag.
    annot[Name("F")] = pDecimal(4)

    # (Optional; PDF 1.2) An appearance dictionary specifying how the
    # annotation shall be presented visually on the page (see 12.5.5,
    # "Appearance Streams"). Individual annotation handlers may ignore this
    # entry and provide their own appearances.
    # The AP entry is not written here.

    # (Required if the appearance dictionary AP contains one or more
    # subdictionaries; PDF 1.2) The annotation's appearance state, which
    # selects the applicable appearance stream from an appearance
    # subdictionary (see Section 12.5.5, "Appearance Streams").
    # The AS entry is not written here.

    # (Optional) An array specifying the characteristics of the annotation's
    # border, which shall be drawn as a rounded rectangle.
    # (PDF 1.0) The array consists of three numbers defining the horizontal
    # corner radius, vertical corner radius, and border width, all in default user
    # space units. If the corner radii are 0, the border has square (not rounded)
    # corners; if the border width is 0, no border is drawn.
    # (PDF 1.1) The array may have a fourth element, an optional dash array
    # defining a pattern of dashes and gaps that shall be used in drawing the
    # border. The dash array shall be specified in the same format as in the
    # line dash pattern parameter of the graphics state (see 8.4.3.6, "Line
    # Dash Pattern").
    if (
        border_horizontal_corner_radius is not None
        and border_vertical_corner_radius is not None
        and border_width is not None
    ):
        annot[Name("Border")] = List().set_can_be_referenced(
            False
        )  # type: ignore [attr-defined]
        annot["Border"].append(pDecimal(border_horizontal_corner_radius))
        annot["Border"].append(pDecimal(border_vertical_corner_radius))
        annot["Border"].append(pDecimal(border_width))

    # (Optional; PDF 1.1) An array of numbers in the range 0.0 to 1.0,
    # representing a colour used for the following purposes:
    # The background of the annotation's icon when closed
    # The title bar of the annotation's pop-up window
    # The border of a link annotation
    # The number of array elements determines the colour space in which the
    # colour shall be defined
    if color is not None:
        # Scale by the largest 8-bit channel value so that a full-intensity
        # channel maps to exactly 1.0 (dividing by 256 topped out at ~0.996).
        # NOTE(review): assumes to_rgb() yields channels in 0-255 - confirm.
        color_max = pDecimal(255)
        rgb_color = color.to_rgb()
        annot[Name("C")] = List().set_can_be_referenced(
            False
        )  # type: ignore [attr-defined]
        annot["C"].append(pDecimal(rgb_color.red / color_max))
        annot["C"].append(pDecimal(rgb_color.green / color_max))
        annot["C"].append(pDecimal(rgb_color.blue / color_max))

    # (Required if the annotation is a structural content item; PDF 1.3) The
    # integer key of the annotation's entry in the structural parent tree (see
    # 14.7.4.4, "Finding Structure Elements from Content Items")
    # The StructParent entry is not written here.

    # (Optional; PDF 1.5) An optional content group or optional content
    # membership dictionary (see 8.11, "Optional Content") specifying the
    # optional content properties for the annotation. Before the annotation is
    # drawn, its visibility shall be determined based on this entry as well as the
    # annotation flags specified in the F entry (see 12.5.3, "Annotation Flags").
    # If it is determined to be invisible, the annotation shall be skipped, as if it
    # were not in the document.
    # The OC entry is not written here.

    return annot