def rectangle_factory(bbox: Tuple[float, ...], seqno: int, fill: float) -> Rect:
    """Create a ``Rect`` from ``bbox`` and tag it with two extra attributes.

    :param bbox: Coordinates unpacked positionally into ``Rect(*bbox)``.
    :param seqno: Value stored on the rectangle as its ``seqno`` attribute.
    :param fill: Value stored on the rectangle as its ``fill`` attribute.
    :return: The new rectangle carrying both extra attributes.
    """
    tagged = Rect(*bbox)
    # Same assignment order as a pair of plain attribute writes.
    for attr_name, attr_value in (("seqno", seqno), ("fill", fill)):
        setattr(tagged, attr_name, attr_value)
    return tagged
def _scaled_size(src_width, src_height, width=None, height=None):
    """Return the ``(width, height)`` an image should be resized to, or None.

    When only one of ``width``/``height`` is given, the other is derived from
    the source aspect ratio and truncated to an int. When neither is given the
    image is left alone and ``None`` is returned.
    """
    if width and height:
        return width, height
    if width:
        return width, int(src_height / src_width * width)
    if height:
        return int(src_width / src_height * height), height
    return None


def _write_images_as_pdf(image_paths, pdf_path, width=None, height=None):
    """Write one PDF at ``pdf_path`` with one page per image in ``image_paths``."""
    # Imported lazily, as in the original code, so that merely importing this
    # module does not require PyMuPDF.
    from fitz import Document, Pixmap, Rect

    with Document() as doc:
        for image_path in image_paths:
            pixmap = Pixmap(image_path)
            size = _scaled_size(pixmap.width, pixmap.height, width, height)
            if size is not None:
                pixmap = Pixmap(pixmap, size[0], size[1], None)
            # Each page is exactly as large as the (possibly resized) image.
            rect = Rect(0, 0, pixmap.width, pixmap.height)
            page = doc.newPage(width=pixmap.width, height=pixmap.height)
            page.insertImage(rect, pixmap=pixmap)
        doc.save(pdf_path, deflate=True)


def manga_to_PDF(dir_path, one_file=True, width=None, height=None):
    """Convert the ``*.jpg`` images of a manga folder into PDF file(s).

    The folder is expected to contain one sub-folder per chapter, each holding
    ``*.jpg`` pages. Output is written into ``dir_path`` itself.

    Args:
        dir_path: Folder to convert. A single trailing ``/`` or ``\\`` is
            stripped.
        one_file: If true, every ``<dir_path>/*/*.jpg`` goes into a single
            ``<basename of dir_path>.pdf``. Otherwise one
            ``<chapter name>.pdf`` is written per sub-folder.
        width: Optional target page width in pixels.
        height: Optional target page height in pixels. If only one of
            ``width``/``height`` is given, the other keeps the aspect ratio.

    Raises:
        ValueError: If ``dir_path`` is empty or is not an existing directory.

    Notes:
        Files and chapters are processed in lexicographic path order so the
        page order is deterministic. Names such as ``10.jpg`` therefore sort
        before ``2.jpg``; zero-pad file names if that matters.
    """
    # Guard against the empty string before indexing the last character.
    if dir_path and dir_path[-1] in ('\\', '/'):
        dir_path = dir_path[0:-1]
    if not os.path.isdir(dir_path):
        raise ValueError('传入的路径并非文件夹')

    # Public API only: the private ``glob._iglob`` helper used previously has
    # changed its positional signature between Python releases.
    from glob import glob

    if one_file:
        title = os.path.basename(dir_path)
        pages = sorted(glob(os.path.join(dir_path, "*", "*.jpg")))
        _write_images_as_pdf(
            pages, os.path.join(dir_path, title + ".pdf"), width, height)
    else:
        for chap in sorted(glob(os.path.join(dir_path, "*"))):
            # Only chapter directories produce a PDF; skip loose files.
            if not os.path.isdir(chap):
                continue
            title = os.path.basename(chap)
            pages = sorted(glob(os.path.join(chap, "*.jpg")))
            _write_images_as_pdf(
                pages, os.path.join(dir_path, title + ".pdf"), width, height)
def Page_Rect_get_Text(doc, page_num, rects, output):
    """Write the words found inside each rectangle of a page to ``output``.

    A separator line and a ``page N`` header (1-based) are written first.
    For every row of ``rects`` a blank line is written, followed by the words
    whose own rectangle lies inside that row's rectangle. Words are ordered
    by item 3 then item 0 of the word tuple, and words sharing item 3 are
    written as one space-joined line.

    Args:
        doc: Object indexable by ``page_num``; the page must offer
            ``getText("words")``.
        page_num: Zero-based page index.
        rects: Array-like with ``.shape`` and ``[row, col]`` indexing, four
            coordinates per row.
        output: Object with a ``write(str)`` method.
    """
    page_words = doc[page_num].getText("words")

    output.write("_" * 30 + "\n")
    output.write(f"page {page_num+1}\n")

    row_count = rects.shape[0]
    for row in range(row_count):
        output.write("\n")
        region = Rect(rects[row, 0], rects[row, 1], rects[row, 2], rects[row, 3])
        # Keep only the words fully contained in the region, ordered by
        # (y1, x0) so that grouping on y1 yields text lines left to right.
        contained = sorted(
            (word for word in page_words if Rect(word[:4]) in region),
            key=itemgetter(3, 0),
        )
        for _, line_words in groupby(contained, key=itemgetter(3)):
            line_text = " ".join(word[4] for word in line_words)
            output.write(line_text.replace("\n", ""))
            output.write("\n")
def append_TOC(existingDoc, newDoc, entryname, filename, startPoint, path):
    """Append ``newDoc`` to ``existingDoc`` and add a table-of-contents entry.

    The entry is drawn on page 0 of ``existingDoc``: the entry name at
    ``startPoint``, a row of dots up to x = 475, and the page number. A link
    rectangle over the entry jumps to the appended page, and the page number
    is also stamped near the bottom of the last page. The document is then
    saved to ``path + filename``.

    Args:
        existingDoc: Document whose first page holds the table of contents.
        newDoc: Document to append.
        entryname: Text of the table-of-contents entry.
        filename: File name appended to ``path`` for saving.
        startPoint: Point (with ``.y``) where the entry text starts.
        path: Prefix concatenated with ``filename``; no separator is added.

    Returns:
        The start point for the next entry, 35 units below ``startPoint``
        at x = 100.
    """
    # Appends the new page to the existing document that contains the table of contents
    existingDoc.insert_pdf(newDoc)
    TOC_page = existingDoc.load_page(page_id=0)

    # Generates the text entry for the new page
    TOC_page.insertText(startPoint, entryname, fontname="helv", fontsize=16, rotate=0)
    x_distance = (fitz.getTextlength(entryname, fontname="helv", fontsize=16)) + 105
    targetPageNumber = existingDoc.page_count
    entrynumber = " %i" % targetPageNumber

    # Bind the page-number position up front: when the entry text already
    # reaches x >= 475 the dot loop never runs, and the position would
    # otherwise be undefined.
    dotLocation = fitz.Point(x_distance, startPoint.y)
    while x_distance < 475:
        dotLocation = fitz.Point(x_distance, startPoint.y)
        TOC_page.insertText(dotLocation, ".", fontname="helv", fontsize=16, rotate=0)
        x_distance = x_distance + 5
    # The page number is drawn at the position of the last dot.
    TOC_page.insertText(dotLocation, entrynumber, fontname="helv", fontsize=16, rotate=0)

    # Creates the hyperlink for the newly appended page
    # When the entry is clicked on in the Table of Contents, user is sent to that particular page
    linkRect = Rect(100, startPoint.y - 20, x_distance + 25, startPoint.y + 15)
    TOC_page.insert_link({'kind': 1, 'from': linkRect, 'type': 'goto', 'page': targetPageNumber - 1, 'to': fitz.Point(0, 0), 'zoom': 0.0})

    # Inserts the page number on the bottom of the newly appended page
    insertedPage = existingDoc.load_page(page_id=-1)
    pageNumberPoint = fitz.Point(294, 830)
    insertPageNumber = "%i" % targetPageNumber
    insertedPage.insertText(pageNumberPoint, insertPageNumber, fontname="Times-Roman", fontsize=14, rotate=0)

    # Calculates the new start point for the next entry and saves the pdf
    newStartPoint = fitz.Point(100, startPoint.y + 35)
    existingDoc.save(path + filename)
    return newStartPoint
def get_main_bbox(bbox_1:fitz.Rect, bbox_2:fitz.Rect, threshold:float=0.95):
    '''Merge two bboxes when one is (almost) covered by the other.

    The overlap ratio is the intersection area divided by the area of the
    smaller bbox. If that ratio is at least ``threshold`` the union of both
    bboxes is returned; otherwise, or when the intersection is falsy, an
    empty ``fitz.Rect()`` is returned (not ``None``).
    '''
    overlap = bbox_1 & bbox_2
    area_1 = bbox_1.getArea()
    area_2 = bbox_2.getArea()
    overlap_area = overlap.getArea()

    # Nothing in common at all.
    if not overlap:
        return fitz.Rect()

    # Two bboxes touching only along an edge give a truthy intersection whose
    # area is zero; use a tiny ratio for that case instead of dividing.
    if overlap_area:
        ratio = overlap_area / min(area_1, area_2)
    else:
        ratio = 1e-6

    if ratio < threshold:
        return fitz.Rect()
    return bbox_1 | bbox_2
def Page_Rect_get_Text_odf(doc, name, page_num, rects, hierarchy, output, style_p):
    """Append the text and images of one page's rectangles to an ODF document.

    A separator paragraph and a ``page N`` paragraph (1-based) are added
    first. Then, for each row ``i`` of ``rects``:

    * if ``hierarchy[i, 3] == -1`` the words inside the rectangle are added
      as one paragraph, preceded by an empty paragraph;
    * otherwise the rectangle is rendered at 2x zoom and embedded as a PNG
      picture in a paragraph-anchored frame.

    Args:
        doc: Object indexable by ``page_num``, yielding a page.
        name: Unused by this function.
        page_num: Zero-based page index.
        rects: Array-like with ``.shape`` and ``[row, col]`` indexing, four
            coordinates per row.
        hierarchy: Array-like indexed as ``hierarchy[i, 3]``.
        output: ODF document exposing ``text.addElement`` and ``addPicture``.
        style_p: Paragraph style applied to the text paragraphs.

    Returns:
        ``output``, after the new elements have been appended.
    """
    page = doc[page_num]
    words = page.getText("words")
    output.text.addElement(P(stylename=style_p, text="_" * 30))
    output.text.addElement(P(stylename=style_p, text=f"page {page_num+1}"))

    for i in range(rects.shape[0]):
        if hierarchy[i, 3] == -1:
            output.text.addElement(P(stylename=style_p, text=""))
            rect = Rect(rects[i, 0], rects[i, 1], rects[i, 2], rects[i, 3])
            mywords = [w for w in words if Rect(w[:4]) in rect]
            mywords.sort(key=itemgetter(3, 0))  # sort by y1, x0 of the word rect
            out_text = P(stylename=style_p, text="")
            for _, gwords in groupby(mywords, key=itemgetter(3)):
                out_text.addText(" ".join(w[4] for w in gwords).replace("\n", " "))
                # Lines of the same block are joined by a single space.
                out_text.addText(" ")
            output.text.addElement(out_text)
        else:
            output.text.addElement(P(stylename=style_p, text=""))
            out_img = P()
            clip = Rect(rects[i, 0], rects[i, 1], rects[i, 2], rects[i, 3])
            pix = page.getPixmap(matrix=Matrix(2, 2), clip=clip)
            name_image = f"Pictures/image-{page.number}-{i}.png"
            pix_png = pix.getPNGData()
            # Size in inches: pixels over dots-per-inch, each along its own
            # axis (width with xres, height with yres).
            w = pix.width / pix.xres
            h = pix.height / pix.yres
            frame = Frame(width=f"{w}in", height=f"{h}in", anchortype="paragraph")
            href = output.addPicture(name_image, mediatype="png", content=pix_png)
            frame.addElement(Image(href=f"./{href}"))
            out_img.addElement(frame)
            output.text.addElement(out_img)
    return output
def intersects(
    text_rect: Rect,
    rectangles: List[Rect],
    occlusion_threshold: float = 0.0,
) -> bool:
    """Determine whether a text rectangle is occluded by any of a list of others.

    This uses Rect objects, but note that they must have extra attributes of
    "fill" and "seqno".

    A rectangle counts as occluding the text when their intersection has a
    positive area and either the rectangle has a higher ``seqno`` than the
    text (the text was drawn first, so it is behind the rectangle) or both
    have the same ``fill`` (same colour, so the text is invisible even when
    drawn on top).

    :param text_rect: The rectangle around the text to check.
    :param rectangles: A list of rectangles to check against.
    :param occlusion_threshold: Fraction of ``text_rect`` that the single
        largest occluding rectangle must cover for the result to be True.
        E.g., 1.0 means the bbox must be fully occluded, 0.10 means it must
        be at least 10% occluded. The default, 0.0, means any occluding
        overlap at all counts.
    :return: True if the largest occluding overlap reaches the threshold,
        else False.
    :raises AssertionError: If any rectangle lacks ``seqno`` or ``fill``.
    """
    for rect in rectangles + [text_rect]:
        assert hasattr(rect, "seqno") and hasattr(
            rect, "fill"
        ), "Rectangle lacks required 'seqno' or 'fill' attribute."

    overlapping_areas = []
    for rect in rectangles:
        intersecting_area = abs(text_rect & rect)
        if intersecting_area <= 0:
            continue
        drawn_over_text = rect.seqno > text_rect.seqno
        same_color = rect.fill == text_rect.fill
        if drawn_over_text or same_color:
            overlapping_areas.append(intersecting_area)

    if not overlapping_areas:
        return False

    area_of_bbox = abs(text_rect.get_area())
    percent_occluded = max(overlapping_areas) / area_of_bbox
    # ">=" rather than ">": a fully covered bbox has ratio exactly 1.0 and
    # must satisfy a threshold of 1.0. Any overlap recorded above is strictly
    # positive, so the default threshold of 0.0 behaves as before.
    return percent_occluded >= occlusion_threshold
def Page_Get_Rects(page, col):
    """Find bounding rectangles of the marks on a rendered page region.

    The page (or part of it, depending on ``col``) is rendered to a PNG,
    decoded, inverted and searched for contours. Two sets of rectangles are
    produced: the bounding boxes of every contour, and "exclusion" boxes
    covering the parts of those bounding boxes that the dilated image does
    not fill.

    Args:
        page: Page object providing ``CropBox`` and ``getPixmap(clip=...)``.
        col: ``1`` clips to the CropBox with its right edge ``x2`` halved,
            ``2`` clips to the CropBox with its left edge moved to ``x2 / 2``,
            any other value uses the whole CropBox.

    Returns:
        Tuple ``(rects_sorted, hierarchy_sorted, Xrects)``:

        * ``rects_sorted`` -- ``(n, 4)`` array of ``x0, y0, x1, y1`` boxes,
          ordered with ``lexsort((x0, y0))``.
        * ``hierarchy_sorted`` -- ``(n, 4)`` array holding the contour
          hierarchy rows in the same order.
        * ``Xrects`` -- ``(m, 4)`` array of ``x0, y0, x1, y1`` exclusion
          boxes, unsorted.

        For ``col == 2`` the x coordinates of all boxes are shifted by the
        clip's left edge.

    NOTE(review): ``imdecode``, ``findContours``, ``dilate`` etc. look like
    OpenCV functions and ``empty``/``lexsort`` like NumPy ones -- confirm
    against the file's imports. No offset is applied for ``col == 1`` or for
    the y axis, which presumably assumes the CropBox starts at (0, 0).
    """
    if col == 1:
        x1, y1, x2, y2 = page.CropBox
        # Halve the right edge of the clip rectangle.
        x2 /= 2
        crop_rect = Rect(x1, y1, x2, y2)
        pix = page.getPixmap(clip=crop_rect)
    elif col == 2:
        x1, y1, x2, y2 = page.CropBox
        # Move the left edge to half of the right edge; x1 is reused at the
        # end to shift the results.
        x1 = x2 / 2
        crop_rect = Rect(x1, y1, x2, y2)
        pix = page.getPixmap(clip=crop_rect)
    else:
        pix = page.getPixmap(clip=page.CropBox)
    # Go through PNG bytes to get the rendering into an image array.
    pix = pix.getPNGData()
    nparr = frombuffer(pix, uint8)
    # Invert the decoded image (255 - value).
    img = 255 - imdecode(nparr, 0)
    # The strategy is now to not print images that are too small, so the
    # former dilate/erode clean-up pass is no longer applied here.
    contour, hierarchy = findContours(img, RETR_CCOMP, CHAIN_APPROX_SIMPLE)
    nb_contour = len(contour)
    rects = empty((nb_contour, 4))
    rects_sorted = empty((nb_contour, 4))
    hierarchy_sorted = empty((nb_contour, 4))
    # Image with every contour's bounding rectangle filled in.
    img_rects = zeros_like(img)
    for i in range(nb_contour):
        # Stored as x, y, w, h at this point.
        rects[i] = boundingRect(contour[i])
        x, y, w, h = rects[i].astype(int)
        img_rects = rectangle(img_rects, (x, y), (x + w, y + h), 255, -1)
    # Dilate the initial image horizontally then vertically, and subtract it
    # from the filled rectangles to isolate the parts of the bounding
    # rectangles that were not highlighted.
    kernelh = ones((1, 3), uint8)
    img = dilate(img, kernelh, iterations=10)
    kernelv = ones((3, 1), uint8)
    img = dilate(img, kernelv, iterations=10)
    img3 = subtract(img_rects, img)
    # Dilate the regions to exclude (vertically).
    img3 = dilate(img3, kernelv, iterations=10)
    # Contours of the regions to exclude.
    Xcontour, Xhierarchy = findContours(img3, RETR_CCOMP, CHAIN_APPROX_SIMPLE)
    nb_Xcontour = len(Xcontour)
    Xrects = empty((nb_Xcontour, 4))
    # Bounding boxes of the contours to exclude.
    for i in range(nb_Xcontour):
        Xrects[i] = boundingRect(Xcontour[i])
    # Convert x, y, w, h into x0, y0, x1, y1.
    rects[:, 2] = rects[:, 0] + rects[:, 2]
    rects[:, 3] = rects[:, 1] + rects[:, 3]
    ind_sorted = lexsort((rects[:, 0], rects[:, 1]))
    Xrects[:, 2] = Xrects[:, 0] + Xrects[:, 2]
    Xrects[:, 3] = Xrects[:, 1] + Xrects[:, 3]
    # No need to sort the excluded contours; callers iterate over all of them.
    for i in range(nb_contour):
        rects_sorted[i] = rects[ind_sorted[i]]
        hierarchy_sorted[i] = hierarchy[0, ind_sorted[i], :]
    if col == 2:
        # Shift x coordinates by the clip's left edge.
        rects_sorted[:, 0] += x1
        rects_sorted[:, 2] += x1
        Xrects[:, 0] += x1
        Xrects[:, 2] += x1
    return rects_sorted, hierarchy_sorted, Xrects
def Page_Rect_get_Text_odf(doc, page_num, rects, hierarchy, Xrects, output, style_p, style_i, img_quality, col):
    """Append the text and images of one page's rectangles to an ODF document.

    A separator paragraph and a page header (``page N`` or
    ``page N - column C`` when ``col`` is 1 or 2) are added first. Then, for
    each row ``i`` of ``rects``:

    * if ``hierarchy[i, 3] == -1`` the words inside the rectangle are added
      as one paragraph, skipping every word whose centre falls inside one of
      ``Xrects``;
    * otherwise the rectangle is rendered with a zoom of
      ``img_quality / 50`` and embedded as a PNG picture, unless the image
      is 20 pixels high or less at the reference zoom of 2.

    Args:
        doc: Object indexable by ``page_num``, yielding a page.
        page_num: Zero-based page index.
        rects: Array-like with ``.shape`` and ``[row, col]`` indexing, four
            coordinates per row.
        hierarchy: Array-like indexed as ``hierarchy[i, 3]``.
        Xrects: Iterable of rectangles whose area is excluded from the text.
        output: ODF document exposing ``text.addElement`` and ``addPicture``.
        style_p: Paragraph style applied to the text paragraphs.
        style_i: Style applied to the image frames.
        img_quality: Image quality; 100 corresponds to a zoom factor of 2.
        col: Column being processed (1 or 2), anything else for a full page.

    Returns:
        ``output``, after the new elements have been appended.
    """
    page = doc[page_num]
    words = page.getText("words")
    output.text.addElement(P(stylename=style_p, text="_" * 60))
    if col == 1 or col == 2:
        output.text.addElement(
            P(stylename=style_p, text=f"page {page_num+1} - column {col}"))
    else:
        output.text.addElement(P(stylename=style_p, text=f"page {page_num+1}"))

    # Loop-invariant values, computed once instead of per word / per image.
    exclusion_zones = [Rect(Xrect) for Xrect in Xrects]
    img_qual = img_quality / 50.

    for i in range(rects.shape[0]):
        if hierarchy[i, 3] == -1:
            rect = Rect(rects[i, 0], rects[i, 1], rects[i, 2], rects[i, 3])
            allwords = [w for w in words if Rect(w[:4]) in rect]
            # Drop every word whose centre lies in an excluded rectangle.
            mywords = []
            for w in allwords:
                center = ((w[0] + w[2]) / 2, (w[1] + w[3]) / 2)
                if not any(zone.contains(center) for zone in exclusion_zones):
                    mywords.append(w)
            mywords.sort(key=itemgetter(3, 0))  # sort by y1, x0 of the word rect
            output.text.addElement(P(stylename=style_p, text=""))
            out_text = P(stylename=style_p, text="")
            for _, gwords in groupby(mywords, key=itemgetter(3)):
                out_text.addText(" ".join(w[4] for w in gwords).replace("\n", " "))
                out_text.addText(" ")
            output.text.addElement(out_text)
        else:
            clip = Rect(rects[i, 0], rects[i, 1], rects[i, 2], rects[i, 3])
            pix = page.getPixmap(matrix=Matrix(img_qual, img_qual), clip=clip)
            name_image = f"Pictures/image-{page.number}-{col}{i}.png"
            pix_png = pix.getPNGData()
            # Size in inches: pixels over dots-per-inch, each along its own
            # axis (width with xres, height with yres).
            w = pix.width / pix.xres
            h = pix.height / pix.yres
            # Whatever the quality, keep the frame the size it would have at
            # a zoom of 2.
            h *= 2 / img_qual
            w *= 2 / img_qual
            # An image that is too small (h <= 20px at zoom 2) is probably an
            # artifact, so it is not embedded.
            if pix.height * 2 / img_qual > 20:
                output.text.addElement(P(stylename=style_p, text=""))
                out_img = P()
                frame = Frame(stylename=style_i,
                              width=f"{w}in",
                              height=f"{h}in",
                              anchortype="paragraph")
                href = output.addPicture(name_image, mediatype="png", content=pix_png)
                frame.addElement(Image(href=f"./{href}"))
                out_img.addElement(frame)
                output.text.addElement(out_img)
    return output
def calculate_location_sign_company_first(self, rect: Rect):
    """Return the box for the first company signature.

    The box is ``self.width`` wide and ``self.height`` tall. Its right edge
    sits ``self.padding + 130`` in from ``rect.width`` and its top edge sits
    ``self.padding + 150`` down from zero.
    """
    right = rect.width - self.padding - 130
    left = right - self.width
    top = self.padding + 150
    bottom = self.height + self.padding + 150
    return Rect(left, top, right, bottom)
def calculate_location_sign_company_second(self, rect: Rect):
    """Return the box for the second company signature.

    The box is ``self.width`` wide and ``self.height`` tall. Its left edge
    sits at ``self.padding + 140`` and its bottom edge sits
    ``self.padding + 40`` up from ``rect.height``.
    """
    left = self.padding + 140
    top = rect.height - 40 - self.padding - self.height
    right = self.width + self.padding + 140
    bottom = rect.height - self.padding - 40
    return Rect(left, top, right, bottom)
def calculate_location_sign_user(self, react: Rect):
    """Return the box for the user signature, anchored bottom-right.

    The box is ``self.width`` wide and ``self.height`` tall, inset by
    ``self.padding`` from the right and bottom edges given by
    ``react.width`` and ``react.height``.

    The top edge is derived from ``self.height``. It previously subtracted
    ``self.width``, which made the box as tall as it is wide, unlike the
    other signature boxes.
    """
    right = react.width - self.padding
    bottom = react.height - self.padding
    left = react.width - self.width - self.padding
    top = react.height - self.height - self.padding
    return Rect(left, top, right, bottom)
def calculate_location_sign_company(self, *args, **kwargs):
    """Return the box for the company signature, anchored top-left.

    The box is ``self.width`` wide and ``self.height`` tall, inset by
    ``self.padding`` from zero on both axes. Any positional or keyword
    arguments are accepted and ignored.
    """
    left = top = self.padding
    right = self.width + self.padding
    bottom = self.height + self.padding
    return Rect(left, top, right, bottom)