示例#1
0
    def process(self):
        """
        Run ocropy line segmentation over every input file.

        For each input file the PAGE document is loaded, the image it
        references is read as a binary image and inverted, and the result
        is passed to ``compute_segmentation``.  Every line found is stored
        as a ``TextLine`` inside one page-sized ``TextRegion`` with the id
        "dummy", and the updated document is added to the workspace under
        the output file group.
        """
        for n, input_file in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            downloaded_file = self.workspace.download_file(input_file)
            log.info("downloaded_file %s", downloaded_file)

            pcgts = page_from_file(downloaded_file)
            page = pcgts.get_Page()
            page_width = page.get_imageWidth()
            page_height = page.get_imageHeight()
            # TODO binarized variant from get_AlternativeImage()
            image_url = page.imageFilename
            log.info("pcgts %s", pcgts)

            local_image = self.workspace.download_url(image_url)
            # Invert the binary image before it is handed to the segmenter.
            binary = 1 - ocrolib.read_image_binary(local_image)

            # A configured scale of 0 means "estimate it from the image".
            scale = self.parameter['scale']
            if scale == 0:
                scale = psegutils.estimate_scale(binary)
            log.debug(binary)

            pseg = self.compute_segmentation(binary, scale)
            log.debug("pseg=%s", pseg)

            # TODO reading order / enumber
            #  log.debug("finding reading order")
            #  lines = psegutils.compute_lines(pseg, scale)
            #  order = psegutils.reading_order([l.bounds for l in lines])
            #  lsort = psegutils.topsort(order)

            regions = ocrolib.RegionExtractor()
            regions.setPageLines(pseg)

            # One region spanning the whole page holds all detected lines.
            page_outline = "0,0 %s,0 %s,%s 0,%s" % (
                page_width, page_width, page_height, page_height)
            dummyRegion = TextRegionType(
                id="dummy", Coords=CoordsType(points=page_outline))
            page.add_TextRegion(dummyRegion)

            # Region indices start at 1; index 0 is skipped.
            for lineno in range(1, regions.length()):
                bbox = regions.bbox(lineno)
                log.debug("id=%s bbox=%s", regions.id(lineno), bbox)
                line_coords = CoordsType(points=points_from_y0x0y1x1(bbox))
                dummyRegion.add_TextLine(
                    TextLineType(id=concat_padded("line", lineno),
                                 Coords=line_coords))

            file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                mimetype=MIMETYPE_PAGE,
                local_filename="%s/%s.xml" % (self.output_file_grp, file_id),
                content=to_xml(pcgts))
示例#2
0
class OcropusSegmentPageBase(OcropusBase, base.JSONWriterMixin):
    """
    Segment an image using Ocropus.
    """
    abstract = True
    stage = stages.PAGE_SEGMENT
    intypes = [ocrolib.numpy.ndarray]
    outtype = dict

    def null_data(self):
        """
        Return an empty result dictionary when ignored.

        The dictionary has the keys ``columns``, ``lines`` and
        ``paragraphs``, each mapped to an empty list.
        """
        return dict(columns=[], lines=[], paragraphs=[])

    def process(self, input):
        """
        Segment a binary image.

        input: a binary image (an array with at least two dimensions).
        return: a dictionary with the keys:
            bbox        [0, 0, width, height] of the input image
            lines       list of (x0, y0, x1, y1) tuples
            paragraphs  list of (x0, y0, x1, y1) tuples
            columns     always an empty list here
        raises: OcropusNodeError if the segmenter raises IndexError,
            TypeError or ValueError.
        """
        out = dict(bbox=[0, 0, input.shape[1], input.shape[0]],
                   columns=[],
                   lines=[],
                   paragraphs=[])
        try:
            page_seg = self._comp.segment(input)
        except (IndexError, TypeError, ValueError) as err:
            # ``except ... as`` and ``str(err)`` work on both Python 2.6+
            # and Python 3; ``err.message`` does not exist on Python 3.
            raise OcropusNodeError(str(err), self)
        regions = ocrolib.RegionExtractor()
        exfuncs = dict(lines=regions.setPageLines,
                       paragraphs=regions.setPageParagraphs)
        # NB: These coordinates are relative to the TOP of the page
        # for some reason
        for box, func in exfuncs.items():
            # Each setter reloads the extractor, so the boxes read below
            # always belong to the box type currently being processed.
            func(page_seg)
            # Region indices start at 1; index 0 is skipped.
            for i in range(1, regions.length()):
                out[box].append((regions.x0(i), regions.y0(i), regions.x1(i),
                                 regions.y1(i)))
        return out
示例#3
0
 def __init__(self, *args, **kwargs):
     """
     Initialise the node and create its ocrolib helper objects.

     All positional and keyword arguments are forwarded unchanged to the
     parent class initialiser.
     """
     super(SegmentPageManual, self).__init__(*args, **kwargs)
     # Region extractor instance kept on the node for later use.
     self._regions = ocrolib.RegionExtractor()
     # Page segmenter instance (ocrolib.SegmentPageByRAST1).
     self._segmenter = ocrolib.SegmentPageByRAST1()
示例#4
0
def processPngFile(outRoot, origFile, fileNum):
    """
    Segment one page image into text lines and save an annotated copy.

    The image is copied into a per-file working directory below
    ``outRoot``, binarized, and segmented into text lines.  The page
    segmentation and one binary image per line are written to disk, and
    a copy of the input with a rectangle drawn around every line is
    saved twice: next to the working copy (``<base>.out.png``) and in a
    sibling ``<outRoot>.out`` directory.

    Relies on the module-level names ``maxlines``, ``noise``, ``pad`` and
    ``expand`` as well as the helpers ``binarize``, ``check_page``,
    ``checktype``, ``desc`` and ``compute_segmentation``.

    Args:
        outRoot: Root directory for the per-file working directories.
        origFile: Path of the image to process.
        fileNum: Number appended (zero-padded to three digits) to the
            working directory name.

    Returns:
        True when the annotated images were written.  False when
        binarization fails, the page check rejects the image, the
        estimated scale is NaN or above 1000, or more than ``maxlines``
        lines are found.
    """
    baseName = os.path.basename(origFile)
    baseBase, _ = os.path.splitext(baseName)
    outDir = os.path.join(outRoot, "%s.%03d" % (baseBase, fileNum))
    inFile = os.path.join(outDir, baseName)

    os.makedirs(outDir, exist_ok=True)
    shutil.copy(origFile, inFile)

    inBase, _ = ocrolib.allsplitext(inFile)
    print("**  inBase=%s" % inBase)

    fname = inFile
    outputdir = inBase
    binFile = inBase + ".bin.png"
    outFile = inBase + ".out.png"
    outRoot2, outDir2 = os.path.split(outRoot)
    outFile2 = os.path.join(outRoot2, "%s.out" % outDir2, baseName)
    print("outFile2=%s" % outFile2)
    grayFile = inBase + ".nrm.png"
    psegFile = inBase + ".pseg.png"
    print("  inFile=%s" % inFile)
    print(" binFile=%s" % binFile)
    print("grayFile=%s" % grayFile)
    print(" outFile=%s" % outFile)
    assert inFile and binFile
    assert outFile != inFile
    assert outFile != binFile

    if not binarize(inFile, binFile, grayFile):
        binExists = os.path.exists(binFile)
        print("Couldn't binarize inFile=%s binFile=%s exists=%s" %
              (inFile, binFile, binExists))
        return False

    binary = ocrolib.read_image_binary(binFile)
    print("$$ %s=%s" % (binFile, desc(binary)))
    checktype(binary, ABINARY2)
    check = check_page(np.amax(binary) - binary)
    if check is not None:
        print("%s SKIPPED %s (use -n to disable this check)" % (inFile, check))
        return False

    binary = 1 - binary  # invert

    scale = psegutils.estimate_scale(binary)
    print("scale %f" % scale)
    if np.isnan(scale) or scale > 1000.0:
        print("%s: bad scale (%g); skipping\n" % (fname, scale))
        return False

    # find columns and text lines
    print("computing segmentation")
    segmentation = compute_segmentation(binary, scale)
    if np.amax(segmentation) > maxlines:
        print("%s: too many lines %g" % (fname, np.amax(segmentation)))
        return False

    print("segmentation=%s" % desc(segmentation))
    print("number of lines %g" % np.amax(segmentation))

    # compute the reading order
    print("finding reading order")
    lines = psegutils.compute_lines(segmentation, scale)
    order = psegutils.reading_order([l.bounds for l in lines])
    lsort = psegutils.topsort(order)
    print("$$ lsort = %d = %s...%s" % (len(lsort), lsort[:10], lsort[-10:]))

    # renumber the labels so that they conform to the specs
    nlabels = np.amax(segmentation) + 1
    renumber = np.zeros(nlabels, 'i')
    for i, v in enumerate(lsort):
        renumber[lines[v].label] = 0x010000 + (i + 1)
    segmentation = renumber[segmentation]

    # finally, output everything
    print("writing lines")
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(outputdir, exist_ok=True)
    lines = [lines[i] for i in lsort]
    ocrolib.write_page_segmentation("%s.pseg.png" % outputdir, segmentation)
    cleaned = ocrolib.remove_noise(binary, noise)
    for i, l in enumerate(lines):
        binline = psegutils.extract_masked(1 - cleaned,
                                           l,
                                           pad=pad,
                                           expand=expand)
        ocrolib.write_image_binary("%s/01%04x.bin.png" % (outputdir, i + 1),
                                   binline)
    # The first field is the index of the last line written.  It is derived
    # from len(lines) instead of the leftover loop variable so that a page
    # without any lines no longer raises UnboundLocalError here.
    print("%6d  %s %4.1f %d" % (len(lines) - 1, fname, scale, len(lines)))

    # to proceed, we need a pseg file and a subdirectory containing text lines
    assert os.path.exists(psegFile), "%s: no such file" % psegFile
    assert os.path.isdir(inBase), "%s: no such directory" % inBase

    # iterate through the text lines in reading order, based on the page segmentation file
    pseg = ocrolib.read_page_segmentation(psegFile)
    print("$$ %s=%s" % (psegFile, desc(pseg)))

    regions = ocrolib.RegionExtractor()
    print("$$ regions=%s" % regions)
    regions.setPageLines(pseg)

    # The context manager closes the underlying image file once both
    # annotated copies have been saved.
    with Image.open(inFile) as im:
        print("~~%s %s" % (inFile, im.size))
        print("$$ regions=%s=%s" % (regions, sorted(regions.__dict__)))
        print("$$ regions.length=%s" % regions.length())

        # One drawing context is enough for all rectangles.
        draw = ImageDraw.Draw(im)
        # Region indices start at 1; index 0 is skipped.
        for i in range(1, regions.length()):
            y0, x0, y1, x1 = regions.bbox(i)
            draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0), width=3)
            draw.rectangle((x0, y0, x1, y1), outline=(0, 0, 255), width=0)
        del draw

        # write output files
        print("outFile=%s" % outFile)
        im.save(outFile, "PNG")
        print("outFile2=%s" % outFile2)
        outDir2 = os.path.dirname(outFile2)
        os.makedirs(outDir2, exist_ok=True)
        im.save(outFile2, "PNG")
    assert os.path.exists(outFile2)
    return True
    def textline(self, arg):
        """
        Split the page image ``arg`` into text-line images.

        The external ``anyBaseOCR-gpageseg.py`` script is run on a padded
        copy of ``<base>.ts.png`` once per block listed in
        ``<base>/sorted_cuts.dat``.  The line bounding boxes are written
        to ``<base>/sorted_lines.dat`` and the line images the script
        produces are converted into ``<base>/lines/``.

        Args:
            arg: Path of the binary page image.  ``base`` is this path
                with all extensions removed by ``ocrolib.allsplitext``.

        Returns:
            List of the paths of the line images written to
            ``<base>/lines/``, in the order they were created.
        """
        image = ocrolib.read_image_binary(arg)
        height, width = image.shape
        base, _ = ocrolib.allsplitext(arg)

        # NOTE(review): paths are interpolated unquoted into the shell
        # commands below; names with spaces or shell metacharacters will
        # break them.

        lines_dir = "%s/lines" % base
        if not os.path.exists(lines_dir):
            os.makedirs(lines_dir)
            # First run for this page: write a single block covering the
            # whole image.
            with open('%s/sorted_cuts.dat' % base, 'w') as cuts_file:
                cuts_file.write("0 0 " + str(int(width)) + " " +
                                str(int(height)) + " 0 0 0 0\n")

        blockarray = []
        cuts_path = base + "/sorted_cuts.dat"
        if os.path.exists(cuts_path):
            with open(cuts_path, "r") as blocks:
                for index, block in enumerate(blocks):
                    words = block.split()
                    blockarray.append((int(words[0]), -int(words[1]),
                                       int(words[2]), int(words[3]), index))
        else:
            blockarray.append((0, 0, width, height, 0))

        # Running line counter; it is NOT reset between blocks.
        line_count = 0
        lines = []
        for block in blockarray:
            # Only the block index is used; the coordinates are ignored.
            block_index = block[4]

            # NOTE(review): this converted temp.png is overwritten by
            # background.save() below.
            os.system("convert %s.ts.png %s/temp.png" % (base, base))

            # Centre the .ts.png image on a white canvas of the size of the
            # input image and hand that canvas to the segmenter.
            with Image.open("%s.ts.png" % base, 'r') as img:
                img_w, img_h = img.size
                background = Image.new('RGBA', (width, height),
                                       (255, 255, 255, 255))
                bg_w, bg_h = background.size
                offX = (bg_w - img_w) // 2
                offY = (bg_h - img_h) // 2
                background.paste(img, (offX, offY))
            background.save("%s/temp.png" % base)

            command = "python " + self.param[
                'libpath'] + "/cli/anyBaseOCR-gpageseg.py %s/temp.png -n --minscale %f --maxlines %f --scale %f --hscale %f --vscale %f --threshold %f --noise %d --maxseps %d --sepwiden %d --maxcolseps %d --csminaspect %f --csminheight %f -p %d -e %d -Q %d" % (
                    base, self.param['minscale'], self.param['maxlines'],
                    self.param['scale'], self.param['hscale'],
                    self.param['vscale'], self.param['threshold'],
                    self.param['noise'], self.param['maxseps'],
                    self.param['sepwiden'], self.param['maxcolseps'],
                    self.param['csminaspect'], self.param['csminheight'],
                    self.param['pad'], self.param['expand'],
                    self.param['parallel'])
            if self.param['blackseps']:
                command = command + " -b"
            if self.param['usegauss']:
                command = command + " --usegauss"
            os.system(command)

            pseg = ocrolib.read_page_segmentation("%s/temp.pseg.png" % base)
            regions = ocrolib.RegionExtractor()
            regions.setPageLines(pseg)

            # NOTE(review): opened in 'w' mode for every block, so only the
            # boxes of the last block remain in the file.
            with open('%s/sorted_lines.dat' % base, 'w') as lines_file:
                # Region indices start at 1; index 0 is skipped.
                for h in range(1, regions.length()):
                    y0, x0, y1, x1 = regions.bbox(h)
                    # Remove the canvas padding offset and flip the y axis
                    # relative to the height of the .ts.png image.
                    fields = (x0 - offX, img_h - (y1 - offY),
                              x1 - offX, img_h - (y0 - offY))
                    lines_file.write(
                        " ".join(str(int(value)) for value in fields) +
                        " 0 0 0 0\n")

            for infile in sorted(glob.glob("%s/temp/*" % base)):
                line_path = "%s/lines/01%02x%02x.bin.png" % (
                    base, block_index + 1, line_count + 1)
                os.system("convert %s %s" % (infile, line_path))
                lines.append(line_path)
                line_count += 1

            os.system("rm -r %s/temp/" % base)
            os.system("rm %s/temp.png %s/temp.pseg.png" % (base, base))
        return lines