def rectangle_factory(bbox: Tuple[float, ...], seqno: int,
                      fill: float) -> Rect:
    """Build a ``Rect`` from *bbox* and tag it with two extra attributes.

    :param bbox: Coordinates, unpacked positionally into ``Rect``.
    :param seqno: Value stored on the result as ``seqno``.
    :param fill: Value stored on the result as ``fill``.
    :return: The new ``Rect`` carrying ``seqno`` and ``fill``.
    """
    rect = Rect(*bbox)
    rect.seqno, rect.fill = seqno, fill
    return rect
示例#2
0
def manga_to_PDF(dir_path, one_file=True, width=None, height=None):
    """Bundle the ``*.jpg`` pages of a manga directory into PDF file(s).

    The directory is expected to hold one sub-directory per chapter, each
    containing that chapter's ``*.jpg`` pages. Every image becomes one PDF
    page whose size equals the (possibly rescaled) image size. Chapters and
    pages are processed in lexicographic path order so the output is
    deterministic.

    :param dir_path: Manga directory; one trailing ``/`` or ``\\`` is dropped.
    :param one_file: If true, write a single ``<dir name>.pdf`` containing all
        chapters; otherwise write one ``<chapter name>.pdf`` per chapter.
        All PDFs are saved inside ``dir_path``.
    :param width: Target pixel width. If only ``width`` is given, the height
        is scaled to keep the aspect ratio.
    :param height: Target pixel height. If only ``height`` is given, the
        width is scaled to keep the aspect ratio.
    :raises ValueError: If ``dir_path`` is not an existing directory
        (including the empty string).
    """
    # endswith() is safe on an empty string, unlike indexing with [-1].
    if dir_path.endswith(('\\', '/')):
        dir_path = dir_path[:-1]
    if not os.path.isdir(dir_path):
        raise ValueError('传入的路径并非文件夹')

    # Public glob API only: the private ``_iglob`` changed its positional
    # signature between Python releases.
    from glob import escape, glob

    from fitz import Document, Pixmap, Rect

    def _resize(pixmap):
        """Return *pixmap* rescaled according to ``width`` / ``height``."""
        if width and height:
            return Pixmap(pixmap, width, height, None)
        if width:
            return Pixmap(pixmap, width,
                          int(pixmap.height / pixmap.width * width), None)
        if height:
            return Pixmap(pixmap,
                          int(pixmap.width / pixmap.height * height),
                          height, None)
        return pixmap

    def _write_pdf(image_paths, title):
        """Write one PDF named ``<title>.pdf`` with one page per image."""
        with Document() as doc:
            for file_path in image_paths:
                pixmap = _resize(Pixmap(file_path))
                rect = Rect(0, 0, pixmap.width, pixmap.height)
                page = doc.newPage(width=pixmap.width, height=pixmap.height)
                page.insertImage(rect, pixmap=pixmap)
            doc.save(os.path.join(dir_path, title + ".pdf"), deflate=True)

    # Escape the literal directory part so names such as "[Author] Title"
    # are not interpreted as glob character classes.
    root = escape(dir_path)

    if one_file:
        pages = sorted(glob(os.path.join(root, "*", "*.jpg")))
        _write_pdf(pages, os.path.basename(dir_path))
    else:
        chapters = sorted(entry for entry in glob(os.path.join(root, "*"))
                          if os.path.isdir(entry))
        for chap in chapters:
            pages = sorted(glob(os.path.join(escape(chap), "*.jpg")))
            _write_pdf(pages, os.path.basename(chap))
示例#3
0
def Page_Rect_get_Text(doc, page_num, rects, output):
    """Write the words found inside each rectangle of a page to *output*.

    A separator line and a ``page N`` header (1-based) are written first.
    Then, for every row of *rects*, a blank line is written followed by the
    words lying inside that rectangle, one output line per distinct bottom
    coordinate (y1), words ordered by x0 within a line.

    :param doc: Indexable document; ``doc[page_num]`` must offer
        ``getText("words")``.
    :param page_num: Zero-based page index.
    :param rects: 2-D array-like with ``.shape`` whose rows hold the four
        coordinates passed to ``Rect``.
    :param output: Object with a ``write(str)`` method.
    """
    page_words = doc[page_num].getText("words")
    output.write("_" * 30 + "\n")
    output.write(f"page {page_num+1}\n")

    by_bottom_then_left = itemgetter(3, 0)
    by_bottom = itemgetter(3)

    for row in range(rects.shape[0]):
        output.write("\n")
        area = Rect(rects[row, 0], rects[row, 1], rects[row, 2], rects[row, 3])
        # Keep only the words whose bounding box lies inside the area.
        inside = sorted((word for word in page_words
                         if Rect(word[:4]) in area),
                        key=by_bottom_then_left)
        for _, line_words in groupby(inside, key=by_bottom):
            line = " ".join(word[4] for word in line_words)
            output.write(line.replace("\n", ""))
            output.write("\n")
示例#4
0
def append_TOC(existingDoc, newDoc, entryname, filename, startPoint, path):
    """Append *newDoc* to *existingDoc* and add a table-of-contents entry.

    Page 0 of *existingDoc* is treated as the table of contents. The entry
    consists of the entry name, a dot leader, the target page number and a
    clickable link to the last page of the merged document. The page number
    is also printed on that last page, and the document is saved to
    ``path + filename``.

    :param existingDoc: Document whose first page is the table of contents.
    :param newDoc: Document appended to *existingDoc*.
    :param entryname: Text shown for the new entry.
    :param filename: File name appended to *path* when saving.
    :param startPoint: Point at which the entry text starts.
    :param path: Prefix concatenated with *filename* (no separator is added).
    :return: Start point for the next entry, 35 units below *startPoint*.
    """
    # Appends the new page to the existing document that contains the table of contents
    existingDoc.insert_pdf(newDoc)
    TOC_page = existingDoc.load_page(page_id=0)

    # Generates the text entry for the new page
    TOC_page.insertText(startPoint, entryname, fontname="helv", fontsize=16, rotate=0)
    x_distance = (fitz.getTextlength(entryname, fontname="helv", fontsize=16)) + 105
    targetPageNumber = existingDoc.page_count
    entrynumber = "   %i" % targetPageNumber

    # Default anchor for the page number: if the entry name is already wider
    # than the dot-leader limit, the loop below never runs and the number is
    # placed directly after the name instead of raising UnboundLocalError.
    dotLocation = fitz.Point(x_distance, startPoint.y)
    while x_distance < 475:
        dotLocation = fitz.Point(x_distance, startPoint.y)
        TOC_page.insertText(dotLocation, ".", fontname="helv", fontsize=16, rotate=0)
        x_distance = x_distance + 5
    TOC_page.insertText(dotLocation, entrynumber, fontname="helv", fontsize=16, rotate=0)

    # Creates the hyperlink for the newly appended page
    # When the entry is clicked on in the Table of Contents, user is sent to that particular page
    linkRect = Rect(100, startPoint.y-20, x_distance + 25, startPoint.y + 15)
    TOC_page.insert_link({'kind': 1, 'from': linkRect, 'type': 'goto', 'page': targetPageNumber-1, 'to': fitz.Point(0, 0), 'zoom': 0.0})

    # Inserts the page number on the bottom of the newly appended page
    insertedPage = existingDoc.load_page(page_id=-1)
    pageNumberPoint = fitz.Point(294, 830)
    insertPageNumber = "%i" % targetPageNumber
    insertedPage.insertText(pageNumberPoint, insertPageNumber, fontname="Times-Roman", fontsize=14, rotate=0)

    # Calculates the new start point for the next entry and saves the pdf
    newStartPoint = fitz.Point(100, startPoint.y + 35)
    existingDoc.save(path + filename)

    return newStartPoint
示例#5
0
def get_main_bbox(bbox_1:fitz.Rect, bbox_2:fitz.Rect, threshold:float=0.95):
    ''' Merge two bbox-es when they overlap strongly enough.

        The overlap ratio is the intersection area divided by the area of
        the smaller bbox. If that ratio reaches `threshold`, the union of
        both bbox-es is returned; otherwise an empty `fitz.Rect()` is
        returned (also when the two do not intersect at all).
    '''
    overlap = bbox_1 & bbox_2
    area_1 = bbox_1.getArea()
    area_2 = bbox_2.getArea()
    overlap_area = overlap.getArea()

    # nothing in common
    if not overlap:
        return fitz.Rect()

    # Two bbox-es touching along an edge give a non-empty intersection of
    # zero area; use a tiny ratio for that case instead of zero.
    if overlap_area:
        ratio = overlap_area / min(area_1, area_2)
    else:
        ratio = 1e-6

    return bbox_1 | bbox_2 if ratio >= threshold else fitz.Rect()
示例#6
0
def Page_Rect_get_Text_odf(doc, name, page_num, rects, hierarchy, output,
                           style_p):
    """Append the text and image regions of one page to an ODF document.

    A separator paragraph and a ``page N`` header (1-based) are added first.
    Each row of *rects* is then handled according to ``hierarchy[i, 3]``:

    * ``-1``: the words inside the rectangle are added as one paragraph,
      ordered by bottom coordinate (y1) and then x0.
    * anything else: the rectangle is rendered at 2x zoom and embedded as a
      PNG picture in a paragraph-anchored frame.

    :param doc: Indexable document; ``doc[page_num]`` is the page.
    :param name: Unused; kept for interface compatibility.
    :param page_num: Zero-based page index.
    :param rects: 2-D array with one ``(x0, y0, x1, y1)`` row per region.
    :param hierarchy: 2-D array, row-aligned with *rects*; only column 3 is
        read (presumably the OpenCV contour parent index, confirm with the
        producer of this array).
    :param output: ODF document exposing ``text.addElement`` and
        ``addPicture``.
    :param style_p: Paragraph style applied to the text paragraphs.
    :return: *output*, for chaining.
    """
    page = doc[page_num]
    words = page.getText("words")
    output.text.addElement(P(stylename=style_p, text="_" * 30))
    output.text.addElement(P(stylename=style_p, text=f"page {page_num+1}"))

    for i in range(rects.shape[0]):
        region = Rect(rects[i, 0], rects[i, 1], rects[i, 2], rects[i, 3])
        # Blank paragraph separating this region from the previous one.
        output.text.addElement(P(stylename=style_p, text=""))

        if hierarchy[i, 3] == -1:
            # Text region: collect the words lying inside the rectangle.
            mywords = [w for w in words if Rect(w[:4]) in region]
            mywords.sort(key=itemgetter(3, 0))  # sort by y1, x0 of the word rect
            out_text = P(stylename=style_p, text="")
            for _, gwords in groupby(mywords, key=itemgetter(3)):
                out_text.addText(" ".join(w[4]
                                          for w in gwords).replace("\n", " "))
                out_text.addText(" ")
            output.text.addElement(out_text)
        else:
            # Image region: render the rectangle and embed it as a picture.
            pix = page.getPixmap(matrix=Matrix(2, 2), clip=region)
            name_image = f"Pictures/image-{page.number}-{i}.png"
            pix_png = pix.getPNGData()
            # Size in inches: the vertical extent uses the vertical
            # resolution and the horizontal extent the horizontal one.
            h = pix.height / pix.yres
            w = pix.width / pix.xres
            frame = Frame(width=f"{w}in",
                          height=f"{h}in",
                          anchortype="paragraph")
            href = output.addPicture(name_image,
                                     mediatype="png",
                                     content=pix_png)
            frame.addElement(Image(href=f"./{href}"))
            out_img = P()
            out_img.addElement(frame)
            output.text.addElement(out_img)
    return output
def intersects(
    text_rect: Rect,
    rectangles: List[Rect],
    occlusion_threshold: float = 0.0,
) -> bool:
    """Determine if a text rectangle is occluded by any of a list of others.

    This uses Rect objects, but note that they must have extra attributes of
    "fill" and "seqno".

    A rectangle counts as occluding the text when the two overlap and either
    the rectangle was drawn after the text (higher ``seqno``) or it has the
    same ``fill`` as the text, which makes the text invisible even when it
    is drawn on top.

    :param text_rect: The rectangle around the text to check for intersections.
    :param rectangles: A list of rectangles to check for intersections.
    :param occlusion_threshold: Fraction of the text rectangle that must be
    covered by at least one single rectangle for it to be considered an
    intersection. E.g., 1.0 means that the bbox must be fully occluded, 0.10
    means it must be at least 10% occluded. The default, 0.0, means they must
    intersect at least a little.
    :return True if the largest single occluding overlap reaches the
    threshold, else False.
    """
    for rect in [*rectangles, text_rect]:
        assert hasattr(rect, "seqno") and hasattr(rect, "fill"), (
            "Rectangle lacks required 'seqno' or 'fill' attribute.")

    overlapping_areas = []
    for rect in rectangles:
        intersecting_area = abs(text_rect & rect)
        if intersecting_area <= 0:
            continue
        drawn_over_text = rect.seqno > text_rect.seqno
        same_color = rect.fill == text_rect.fill
        if drawn_over_text or same_color:
            overlapping_areas.append(intersecting_area)

    if not overlapping_areas:
        return False

    # Every collected overlap is strictly positive, so the text rectangle has
    # a non-zero area here and the ratio is strictly positive as well.
    percent_occluded = max(overlapping_areas) / abs(text_rect.get_area())

    # ">=" so that a threshold of 1.0 (fully occluded) is reachable; the
    # default 0.0 still requires a real overlap because the ratio is > 0.
    return percent_occluded >= occlusion_threshold
示例#8
0
def Page_Get_Rects(page, col):
    """Detect content regions on a rendered page via contour analysis.

    The page (or one half of it) is rendered to a pixmap, inverted, and
    passed to ``findContours``. The bounding rectangle of every contour is
    returned, together with the matching hierarchy rows and a second set of
    "exclusion" rectangles derived from the parts of the bounding rectangles
    that the dilated page content does not cover.

    :param page: Page object exposing ``CropBox`` and ``getPixmap``.
    :param col: ``1`` renders the area from the CropBox left edge to half of
        its right coordinate, ``2`` the area from that half to the right
        edge; any other value renders the whole CropBox.
    :return: ``(rects_sorted, hierarchy_sorted, Xrects)``; the rectangle
        arrays hold ``(x0, y0, x1, y1)`` rows and ``hierarchy_sorted`` is
        row-aligned with ``rects_sorted``.
    """
    if col == 1:
        # Left part: keep x1 and halve the right coordinate.
        x1, y1, x2, y2 = page.CropBox
        x2 /= 2
        crop_rect = Rect(x1, y1, x2, y2)
        pix = page.getPixmap(clip=crop_rect)

    elif col == 2:
        # Right part: start at half of the right coordinate. x1 is reused at
        # the end of the function to shift the results back.
        x1, y1, x2, y2 = page.CropBox
        x1 = x2 / 2
        crop_rect = Rect(x1, y1, x2, y2)
        pix = page.getPixmap(clip=crop_rect)

    else:
        pix = page.getPixmap(clip=page.CropBox)

    # Round-trip through PNG bytes to obtain an image array; the 255 - ...
    # inverts the intensities.
    # NOTE(review): flag 0 presumably selects grayscale decoding (OpenCV
    # imdecode) - confirm against the imports.
    pix = pix.getPNGData()
    nparr = frombuffer(pix, uint8)
    img = 255 - imdecode(nparr, 0)

    # The strategy is now to skip images that are too small (done by the
    # caller), so the morphological clean-up below stays disabled.
    #kernel = ones((3,3), uint8)
    #img=dilate(img,kernel,iterations = 2)
    #img=erode(img,kernel,iterations = 2)
    contour, hierarchy = findContours(img, RETR_CCOMP, CHAIN_APPROX_SIMPLE)

    nb_contour = len(contour)
    rects = empty((nb_contour, 4))
    rects_sorted = empty((nb_contour, 4))
    hierarchy_sorted = empty((nb_contour, 4))

    # Image in which every bounding rectangle is drawn filled (value 255).
    img_rects = zeros_like(img)

    for i in range(nb_contour):
        # boundingRect yields (x, y, w, h); stored as such for now and
        # converted to corner coordinates further down.
        rects[i] = boundingRect(contour[i])
        x, y, w, h = rects[i].astype(int)
        img_rects = rectangle(img_rects, (x, y), (x + w, y + h), 255, -1)

    # Dilate the page content horizontally, then vertically, so that what
    # remains after the subtraction below is the part of the bounding
    # rectangles that the (dilated) content does not cover.
    kernelh = ones((1, 3), uint8)
    img = dilate(img, kernelh, iterations=10)
    kernelv = ones((3, 1), uint8)
    img = dilate(img, kernelv, iterations=10)

    img3 = subtract(img_rects, img)
    # Grow the leftover areas vertically; these are the zones to exclude.
    img3 = dilate(img3, kernelv, iterations=10)
    # Contours of the zones to exclude.
    Xcontour, Xhierarchy = findContours(img3, RETR_CCOMP, CHAIN_APPROX_SIMPLE)

    nb_Xcontour = len(Xcontour)
    Xrects = empty((nb_Xcontour, 4))
    # Bounding boxes (x, y, w, h) of the contours to exclude.
    for i in range(nb_Xcontour):
        Xrects[i] = boundingRect(Xcontour[i])

    # Convert (x, y, w, h) into (x0, y0, x1, y1).
    rects[:, 2] = rects[:, 0] + rects[:, 2]
    rects[:, 3] = rects[:, 1] + rects[:, 3]
    # NOTE(review): assuming numpy.lexsort, the last key is the primary one,
    # i.e. rows are ordered by y0 first and x0 second - confirm.
    ind_sorted = lexsort((rects[:, 0], rects[:, 1]))

    Xrects[:, 2] = Xrects[:, 0] + Xrects[:, 2]
    Xrects[:, 3] = Xrects[:, 1] + Xrects[:, 3]
    # No need to sort the excluded rectangles; callers iterate over all.

    # Reorder the rectangles and their hierarchy rows with the same index.
    for i in range(nb_contour):
        rects_sorted[i] = rects[ind_sorted[i]]
        hierarchy_sorted[i] = hierarchy[0, ind_sorted[i], :]

    if col == 2:
        # Shift x coordinates by the left edge of the right-half clip,
        # presumably to map them back into full-page coordinates.
        rects_sorted[:, 0] += x1
        rects_sorted[:, 2] += x1
        Xrects[:, 0] += x1
        Xrects[:, 2] += x1

    return rects_sorted, hierarchy_sorted, Xrects
示例#9
0
def Page_Rect_get_Text_odf(doc, page_num, rects, hierarchy, Xrects, output,
                           style_p, style_i, img_quality, col):
    """Append the text and image regions of one page to an ODF document.

    A separator paragraph and a page header (with the column number when
    *col* is 1 or 2) are added first. Each row of *rects* is then handled
    according to ``hierarchy[i, 3]``:

    * ``-1``: the words inside the rectangle whose centre is not inside any
      of the *Xrects* are added as one paragraph, ordered by bottom
      coordinate (y1) and then x0.
    * anything else: the rectangle is rendered and embedded as a PNG picture,
      unless the rendering is too small to be a real image.

    :param doc: Indexable document; ``doc[page_num]`` is the page.
    :param page_num: Zero-based page index.
    :param rects: 2-D array with one ``(x0, y0, x1, y1)`` row per region.
    :param hierarchy: 2-D array, row-aligned with *rects*; only column 3 is
        read (presumably the OpenCV contour parent index, confirm with the
        producer of this array).
    :param Xrects: Iterable of ``(x0, y0, x1, y1)`` exclusion rectangles.
    :param output: ODF document exposing ``text.addElement`` and
        ``addPicture``.
    :param style_p: Paragraph style applied to the text paragraphs.
    :param style_i: Style applied to the picture frames.
    :param img_quality: Rendering quality; the zoom factor is
        ``img_quality / 50``.
    :param col: Column number shown in the header and used in picture names.
    :return: *output*, for chaining.
    """
    page = doc[page_num]
    words = page.getText("words")
    output.text.addElement(P(stylename=style_p, text="_" * 60))
    if col in (1, 2):
        output.text.addElement(
            P(stylename=style_p, text=f"page {page_num+1} - column {col}"))
    else:
        output.text.addElement(P(stylename=style_p, text=f"page {page_num+1}"))

    # Build the exclusion rectangles once instead of once per word.
    excluded_zones = [Rect(Xrect) for Xrect in Xrects]
    # Zoom factor derived from the requested quality.
    img_qual = img_quality / 50.

    for i in range(rects.shape[0]):
        region = Rect(rects[i, 0], rects[i, 1], rects[i, 2], rects[i, 3])

        if hierarchy[i, 3] == -1:
            # Text region: keep words inside the rectangle whose centre does
            # not fall into any exclusion zone.
            mywords = []
            for w in words:
                if Rect(w[:4]) not in region:
                    continue
                centre = ((w[0] + w[2]) / 2, (w[1] + w[3]) / 2)
                if not any(zone.contains(centre) for zone in excluded_zones):
                    mywords.append(w)

            mywords.sort(key=itemgetter(3, 0))  # sort by y1, x0 of the word rect

            output.text.addElement(P(stylename=style_p, text=""))
            out_text = P(stylename=style_p, text="")
            for _, gwords in groupby(mywords, key=itemgetter(3)):
                out_text.addText(" ".join(w[4]
                                          for w in gwords).replace("\n", " "))
                out_text.addText(" ")
            output.text.addElement(out_text)
        else:
            # Image region: render the rectangle at the requested zoom.
            pix = page.getPixmap(matrix=Matrix(img_qual, img_qual),
                                 clip=region)

            # If the image is too small (less than 20 px high at zoom 2) it
            # is probably an artifact, so do not print it.
            if pix.height * 2 / img_qual <= 20:
                continue

            name_image = f"Pictures/image-{page.number}-{col}{i}.png"
            pix_png = pix.getPNGData()
            # Size in inches: the vertical extent uses the vertical
            # resolution and the horizontal extent the horizontal one.
            h = pix.height / pix.yres
            w = pix.width / pix.xres
            # Normalise the frame to the size it would have at zoom 2,
            # whatever zoom was actually used for rendering.
            h *= 2 / img_qual
            w *= 2 / img_qual

            output.text.addElement(P(stylename=style_p, text=""))
            frame = Frame(stylename=style_i,
                          width=f"{w}in",
                          height=f"{h}in",
                          anchortype="paragraph")
            href = output.addPicture(name_image,
                                     mediatype="png",
                                     content=pix_png)
            frame.addElement(Image(href=f"./{href}"))
            out_img = P()
            out_img.addElement(frame)
            output.text.addElement(out_img)
    return output
示例#10
0
 def calculate_location_sign_company_first(self, rect: Rect):
     """Return the box for the first company signature.

     The box is ``self.width`` wide; its right edge sits ``self.padding + 130``
     left of ``rect.width`` and its top edge at ``self.padding + 150``.
     """
     right = rect.width - self.padding - 130
     left = right - self.width
     top = self.padding + 150
     bottom = self.height + self.padding + 150
     return Rect(left, top, right, bottom)
示例#11
0
 def calculate_location_sign_company_second(self, rect: Rect):
     """Return the box for the second company signature.

     The left edge is at ``self.padding + 140``; the bottom edge sits
     ``self.padding + 40`` above ``rect.height``.
     """
     left = self.padding + 140
     top = rect.height - 40 - self.padding - self.height
     right = self.width + self.padding + 140
     bottom = rect.height - self.padding - 40
     return Rect(left, top, right, bottom)
示例#12
0
 def calculate_location_sign_user(self, react: Rect):
     """Return the box for the user signature.

     The right and bottom edges sit ``self.padding`` inside ``react.width``
     and ``react.height`` respectively.
     """
     left = react.width - self.width - self.padding
     # NOTE(review): the vertical extent uses self.width, not self.height,
     # unlike the sibling calculate_location_sign_* methods - confirm this
     # square box is intended.
     top = react.height - self.width - self.padding
     right = react.width - self.padding
     bottom = react.height - self.padding
     return Rect(left, top, right, bottom)
示例#13
0
 def calculate_location_sign_company(self, *args, **kwargs):
     """Return the company signature box, offset by ``self.padding``.

     The box is ``self.width`` by ``self.height`` and starts ``self.padding``
     from the origin on both axes. Any arguments are accepted and ignored.
     """
     left = top = self.padding
     return Rect(left, top, self.width + left, self.height + top)