Exemplo n.º 1
0
def extract(image):

    try:
        binary = ocrolib.read_image_binary(image)
        binary = 1-binary

        scale = psegutils.estimate_scale(binary)
        segmentation = compute_segmentation(binary,scale)

        # ...lines = compute_lines(segmentation,scale)

        # compute the reading order
        lines = psegutils.compute_lines(segmentation,scale)
        order = psegutils.reading_order([l.bounds for l in lines])
        lsort = psegutils.topsort(order)

        # renumber the labels so that they conform to the specs
        nlabels = amax(compute_segmentation)+1
        renumber = zeros(nlabels,'i')
        for i,v in enumerate(lsort): renumber[lines[v].label] = 0x010000+(i+1)
        segmentation = renumber[segmentation]

        outputdir = "http://127.0.0.1:5000/uploads/"
        
        lines = [lines[i] for i in lsort]
        ocrolib.write_page_segmentation("%s.pseg.png"%outputdir,segmentation)


        cleaned = ocrolib.remove_noise(binary,args.noise)
        for i,l in enumerate(lines):
            binline = psegutils.extract_masked(1-cleaned,l,pad=args.pad,expand=args.expand)
            ocrolib.write_image_binary("%s/01%04x.bin.png"%(outputdir,i+1),binline)
        #print "%6d"%i,fname,"%4.1f"%scale,len(lines)
    except:
        print ('error')
    def _process_segment(self, page_image, page, textregion, region_xywh,
                         page_id, input_file, n):
        LOG = getLogger('OcrdAnybaseocrTextline')
        #check for existing text lines and whether to overwrite them
        if textregion.get_TextLine():
            if self.parameter['overwrite']:
                LOG.info('removing existing TextLines in region "%s"', page_id)
                textregion.set_TextLine([])
            else:
                LOG.warning('keeping existing TextLines in region "%s"',
                            page_id)
                return

        binary = ocrolib.pil2array(page_image)

        if len(binary.shape) > 2:
            binary = np.mean(binary, 2)
        binary = np.array(1 - binary / np.amax(binary), 'B')

        if self.parameter['scale'] == 0:
            scale = psegutils.estimate_scale(binary)
        else:
            scale = self.parameter['scale']

        if np.isnan(
                scale) or scale > 1000.0 or scale < self.parameter['minscale']:
            LOG.warning(str(scale) + ": bad scale; skipping!\n")
            return

        segmentation = self.compute_segmentation(binary, scale)
        if np.amax(segmentation) > self.parameter['maxlines']:
            LOG.warning("too many lines %i; skipping!\n",
                        (np.amax(segmentation)))
            return
        lines = psegutils.compute_lines(segmentation, scale)
        order = psegutils.reading_order([l.bounds for l in lines])
        lsort = psegutils.topsort(order)

        # renumber the labels so that they conform to the specs

        nlabels = np.amax(segmentation) + 1
        renumber = np.zeros(nlabels, 'i')
        for i, v in enumerate(lsort):
            renumber[lines[v].label] = 0x010000 + (i + 1)
        segmentation = renumber[segmentation]

        lines = [lines[i] for i in lsort]
        cleaned = ocrolib.remove_noise(binary, self.parameter['noise'])

        for i, l in enumerate(lines):
            #LOG.info('check this: ')
            #LOG.info(type(l.bounds))
            #LOG.info(l.bounds)
            #line_points = np.where(l.mask==1)
            #hull = MultiPoint([x for x in zip(line_points[0],line_points[1])]).convex_hull
            #x,y = hull.exterior.coords.xy
            #LOG.info('hull coords x: ',x)
            #LOG.info('hull coords y: ',y)

            min_x, max_x = (l.bounds[0].start, l.bounds[0].stop)
            min_y, max_y = (l.bounds[1].start, l.bounds[1].stop)

            line_polygon = [[min_x, min_y], [max_x, min_y], [max_x, max_y],
                            [min_x, max_y]]

            #line_polygon = [x for x in zip(y, x)]
            line_polygon = coordinates_for_segment(line_polygon, page_image,
                                                   region_xywh)
            line_points = points_from_polygon(line_polygon)

            img = cleaned[l.bounds[0], l.bounds[1]]
            img = np.array(255 * (img > ocrolib.midrange(img)), 'B')
            img = 255 - img
            img = ocrolib.array2pil(img)

            file_id = make_file_id(input_file, self.output_file_grp)
            file_path = self.workspace.save_image_file(
                img,
                file_id + "_" + str(n) + "_" + str(i),
                page_id=page_id,
                file_grp=self.output_file_grp)
            ai = AlternativeImageType(filename=file_path,
                                      comments=region_xywh['features'])
            line_id = '%s_line%04d' % (page_id, i)
            line = TextLineType(custom='readingOrder {index:' + str(i) + ';}',
                                id=line_id,
                                Coords=CoordsType(line_points))
            line.add_AlternativeImage(ai)
            textregion.add_TextLine(line)
Exemplo n.º 3
0
def analyze_page_layout(binary, gray, rgb=None):
    hscale = 1.0  # Non-standard scaling of horizontal parameters.
    vscale = 1.0  # Non-standard scaling of vertical parameters.
    threshold = 0.2  # baseline threshold.
    usegauss = True  # Use gaussian instead of uniform.
    maxseps = 0  # Maximum black column separators.
    sepwiden = 10  # Widen black separators (to account for warping).
    blackseps = True
    maxcolseps = 3  # Maximum # whitespace column separators.
    csminheight = 10  # Minimum column height (units=scale).
    noise = 8  # Noise threshold for removing small components from lines.
    gray_output = True  # Output grayscale lines as well, which are extracted from the grayscale version of the pages.
    pad = 3  # Padding for extracted lines.
    expand = 3  # Expand mask for grayscale extraction.

    if False:
        bin_image_filepath = './ocropy_test.bin.png'
        gray_image_filepath = './ocropy_test.nrm.png'

        binary = ocrolib.read_image_binary(bin_image_filepath)
        gray = ocrolib.read_image_gray(gray_image_filepath)

    binary = 1 - binary  # Invert.

    scale = psegutils.estimate_scale(binary)
    segmentation = compute_segmentation(binary,
                                        scale,
                                        blackseps,
                                        maxseps,
                                        maxcolseps,
                                        csminheight,
                                        sepwiden,
                                        usegauss,
                                        vscale,
                                        hscale,
                                        threshold,
                                        quiet=True)

    lines = psegutils.compute_lines(segmentation, scale)
    order = psegutils.reading_order([l.bounds for l in lines])
    lsort = psegutils.topsort(order)

    # Renumber the labels so that they conform to the specs.
    nlabels = np.amax(segmentation) + 1
    renumber = np.zeros(nlabels, 'i')
    for i, v in enumerate(lsort):
        renumber[lines[v].label] = 0x010000 + (i + 1)
    segmentation = renumber[segmentation]  # Image.

    lines = [lines[i] for i in lsort]

    # Visualize bounding boxes.
    if False:
        if rgb is not None:
            # REF [function] >> extract_masked() in ${OCROPY_HOME}/ocrolib/psegutils.py.
            for l in lines:
                y0, x0, y1, x1 = [
                    int(x) for x in [
                        l.bounds[0].start, l.bounds[1].start, l.bounds[0].stop,
                        l.bounds[1].stop
                    ]
                ]
                cv2.rectangle(rgb, (x0, y0), (x1, y1), (0, 0, 255), 1,
                              cv2.LINE_AA)
            cv2.imshow('Image', rgb)
            cv2.waitKey(0)

    # Output everything.
    if False:
        if not os.path.exists(outputdir):
            os.mkdir(outputdir)

        ocrolib.write_page_segmentation("%s.pseg.png" % outputdir,
                                        segmentation)
        cleaned = ocrolib.remove_noise(binary, noise)
        for i, l in enumerate(lines):
            binline = psegutils.extract_masked(1 - cleaned,
                                               l,
                                               pad=pad,
                                               expand=expand)  # Image.
            ocrolib.write_image_binary(
                "%s/01%04x.bin.png" % (outputdir, i + 1), binline)
            if gray_output:
                grayline = psegutils.extract_masked(gray,
                                                    l,
                                                    pad=pad,
                                                    expand=expand)  # Image.
                ocrolib.write_image_gray(
                    "%s/01%04x.nrm.png" % (outputdir, i + 1), grayline)
Exemplo n.º 4
0
def processPngFile(outRoot, origFile, fileNum):
    baseName = os.path.basename(origFile)
    baseBase, _ = os.path.splitext(baseName)
    outDir = os.path.join(outRoot, "%s.%03d" % (baseBase, fileNum))
    inFile = os.path.join(outDir, baseName)

    os.makedirs(outDir, exist_ok=True)
    shutil.copy(origFile, inFile)

    inBase, _ = ocrolib.allsplitext(inFile)
    print("**  inBase=%s" % inBase)
    # print("** binBase=%s" % binBase)

    fname = inFile
    outputdir = inBase
    binFile = inBase + ".bin.png"
    outFile = inBase + ".out.png"
    outRoot2, outDir2 = os.path.split(outRoot)
    outFile2 = os.path.join(outRoot2, "%s.out" % outDir2, baseName)
    print("outFile2=%s" % outFile2)
    # assert False
    grayFile = inBase + ".nrm.png"
    psegFile = inBase + ".pseg.png"
    print("  inFile=%s" % inFile)
    print(" binFile=%s" % binFile)
    print("grayFile=%s" % grayFile)
    print(" outFile=%s" % outFile)
    assert inFile and binFile
    assert outFile != inFile
    assert outFile != binFile

    if not binarize(inFile, binFile, grayFile):
        binExists = os.path.exists(binFile)
        print("Couldn't binarize inFile=%s binFile=%s exists=%s" %
              (inFile, binFile, binExists))
        return False

    binary = ocrolib.read_image_binary(binFile)
    print("$$ %s=%s" % (binFile, desc(binary)))
    height, width = binary.shape
    checktype(binary, ABINARY2)
    check = check_page(np.amax(binary) - binary)
    if check is not None:
        print("%s SKIPPED %s (use -n to disable this check)" % (inFile, check))
        return False

    # if args.gray:
    #     if os.path.exists(base+".nrm.png"):
    #         gray = ocrolib.read_image_gray(base+".nrm.png")
    #         checktype(gray, GRAYSCALE)
    #     else:
    #         print_error("Grayscale version %s.nrm.png not found. Use ocropus-nlbin for creating " +
    #                     "normalized grayscale version of the pages as well." % base)
    #         return

    binary = 1 - binary  # invert

    scale = psegutils.estimate_scale(binary)
    print("scale %f" % scale)
    if np.isnan(scale) or scale > 1000.0:
        print("%s: bad scale (%g); skipping\n" % (fname, scale))
        return False

    # find columns and text lines
    print("computing segmentation")
    segmentation = compute_segmentation(binary, scale)
    if np.amax(segmentation) > maxlines:
        print("%s: too many lines %g" % (fname, np.amax(segmentation)))
        return False

    print("segmentation=%s" % desc(segmentation))
    print("number of lines %g" % np.amax(segmentation))

    # compute the reading order
    print("finding reading order")
    lines = psegutils.compute_lines(segmentation, scale)
    order = psegutils.reading_order([l.bounds for l in lines])
    lsort = psegutils.topsort(order)
    print("$$ lsort = %d = %s...%s" % (len(lsort), lsort[:10], lsort[-10:]))

    # renumber the labels so that they conform to the specs
    nlabels = np.amax(segmentation) + 1
    renumber = np.zeros(nlabels, 'i')
    for i, v in enumerate(lsort):
        renumber[lines[v].label] = 0x010000 + (i + 1)
    segmentation = renumber[segmentation]

    # finally, output everything
    print("writing lines")
    if not os.path.exists(outputdir):
        os.mkdir(outputdir)
    lines = [lines[i] for i in lsort]
    ocrolib.write_page_segmentation("%s.pseg.png" % outputdir, segmentation)
    cleaned = ocrolib.remove_noise(binary, noise)
    for i, l in enumerate(lines):
        binline = psegutils.extract_masked(1 - cleaned,
                                           l,
                                           pad=pad,
                                           expand=expand)
        ocrolib.write_image_binary("%s/01%04x.bin.png" % (outputdir, i + 1),
                                   binline)
        # if args.gray:
        #     grayline = psegutils.extract_masked(
        #         gray, l, pad=args.pad, expand=args.expand)
        #     ocrolib.write_image_gray("%s/01%04x.nrm.png" % (outputdir, i+1), grayline)
    print("%6d  %s %4.1f %d" % (i, fname, scale, len(lines)))

    # to proceed, we need a pseg file and a subdirectory containing text lines
    assert os.path.exists(psegFile), "%s: no such file" % psegFile
    assert os.path.isdir(inBase), "%s: no such directory" % inBase

    # iterate through the text lines in reading order, based on the page segmentation file
    pseg = ocrolib.read_page_segmentation(psegFile)
    print("$$ %s=%s" % (psegFile, desc(pseg)))

    regions = ocrolib.RegionExtractor()
    print("$$ regions=%s" % regions)
    regions.setPageLines(pseg)

    im = Image.open(inFile)
    print("~~%s %s" % (inFile, im.size))
    print("$$ regions=%s=%s" % (regions, sorted(regions.__dict__)))
    print("$$ regions.length=%s" % regions.length())

    n = regions.length()
    for i in range(1, n):

        id = regions.id(i)
        y0, x0, y1, x1 = regions.bbox(i)
        # print("%5d: 0x%05X %s %d x %d" %
        #       (i, id, [y0, x0, y1, x1], y1 - y0, x1 - x0))

        draw = ImageDraw.Draw(im)
        draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0), width=3)
        draw.rectangle((x0, y0, x1, y1), outline=(0, 0, 255), width=0)
        # draw.rectangle((x0, y0, x1, y1), outline=255, width=5)
        # draw.rectangle((x0, y0, x1, y1), outline=10,  width=1)
        del draw

    # write output files
    print("outFile=%s" % outFile)
    im.save(outFile, "PNG")
    print("outFile2=%s" % outFile2)
    outDir2 = os.path.dirname(outFile2)
    os.makedirs(outDir2, exist_ok=True)
    im.save(outFile2, "PNG")
    assert os.path.exists(outFile2)
    # outFile3, _ = os.path.splitext(outFile)
    # outFile3 = "%s.jpg" % outFile3
    # print("outFile3=%s" % outFile3)
    # im.save(outFile3, "JPEG")
    # assert os.path.exists(outFile3)
    return True
Exemplo n.º 5
0
def process(job):
    imagepath, i = job
    global base
    base, _ = ocrolib.allsplitext(imagepath)
    outputdir = base
    imagename_base = os.path.basename(os.path.normpath(base))

    try:
        binary = ocrolib.read_image_binary(imagepath)
    except IOError:
        if ocrolib.trace: traceback.print_exc()
        print_error("cannot open either %s.bin.png or %s" % (base, imagepath))
        return

    checktype(binary, ABINARY2)

    if not args['nocheck']:
        check = check_page(amax(binary) - binary)
        if check is not None:
            print_error("%s SKIPPED %s (use -n to disable this check)" %
                        (imagepath, check))
            return

    binary = 1 - binary  # invert

    if args['scale'] == 0:
        scale = psegutils.estimate_scale(binary)
    else:
        scale = args['scale']
    print_info("scale %f" % (scale))
    if isnan(scale) or scale > 1000.0:
        print_error("%s: bad scale (%g); skipping\n" % (imagepath, scale))
        return
    if scale < args['minscale']:
        print_error("%s: scale (%g) less than --minscale; skipping\n" %
                    (imagepath, scale))
        return

    # find columns and text lines

    if not args['quiet']: print_info("computing segmentation")
    segmentation = compute_segmentation(binary, scale)
    if amax(segmentation) > args['maxlines']:
        print_error("%s: too many lines %g" % (imagepath, amax(segmentation)))
        return
    if not args['quiet']: print_info("number of lines %g" % amax(segmentation))

    # compute the reading order

    if not args['quiet']: print_info("finding reading order")
    lines = psegutils.compute_lines(segmentation, scale)
    order = psegutils.reading_order([l.bounds for l in lines])
    lsort = psegutils.topsort(order)

    # renumber the labels so that they conform to the specs

    nlabels = amax(segmentation) + 1
    renumber = zeros(nlabels, 'i')
    for i, v in enumerate(lsort):
        renumber[lines[v].label] = 0x010000 + (i + 1)
    segmentation = renumber[segmentation]

    # finally, output everything
    if not args['quiet']: print_info("writing lines")
    if not os.path.exists(outputdir):
        os.mkdir(outputdir)
    lines = [lines[i] for i in lsort]
    ocrolib.write_page_segmentation("%s.pseg.png" % outputdir, segmentation)
    cleaned = ocrolib.remove_noise(binary, args['noise'])
    for i, l in enumerate(lines):
        binline = psegutils.extract_masked(1 - cleaned,
                                           l,
                                           pad=args['pad'],
                                           expand=args['expand'])
        ocrolib.write_image_binary(
            "%s/%s_01%04x.bin.png" % (outputdir, imagename_base, i + 1),
            binline)
    print_info("%6d  %s %4.1f %d" % (i, imagepath, scale, len(lines)))
    return outputdir
Exemplo n.º 6
0
def process1(job):
    fname, i = job
    global base
    base, _ = ocrolib.allsplitext(fname)
    outputdir = base

    try:
        binary = ocrolib.read_image_binary(base + ".bin.png")
    except IOError:
        try:
            binary = ocrolib.read_image_binary(fname)
        except IOError:
            if ocrolib.trace:
                traceback.print_exc()
            print("cannot open either", base + ".bin.png", "or", fname)
            return

    checktype(binary, ABINARY2)

    if not args.nocheck:
        check = check_page(amax(binary) - binary)
        if check is not None:
            print(fname, "SKIPPED", check, "(use -n to disable this check)")
            return

    if args.gray:
        if os.path.exists(base + ".nrm.png"):
            gray = ocrolib.read_image_gray(base + ".nrm.png")
        checktype(gray, GRAYSCALE)

    binary = 1 - binary  # invert

    if args.scale == 0:
        scale = psegutils.estimate_scale(binary)
    else:
        scale = args.scale
    print("scale", scale)
    if isnan(scale) or scale > 1000.0:
        sys.stderr.write("%s: bad scale (%g); skipping\n" % (fname, scale))
        return
    if scale < args.minscale:
        sys.stderr.write("%s: scale (%g) less than --minscale; skipping\n" %
                         (fname, scale))
        return

    # find columns and text lines

    if not args.quiet:
        print("computing segmentation")
    segmentation = compute_segmentation(binary, scale)
    if amax(segmentation) > args.maxlines:
        print(fname, ": too many lines", amax(segmentation))
        return
    if not args.quiet:
        print("number of lines", amax(segmentation))

    # compute the reading order

    if not args.quiet:
        print("finding reading order")
    lines = psegutils.compute_lines(segmentation, scale)
    order = psegutils.reading_order([l.bounds for l in lines])
    lsort = psegutils.topsort(order)

    # renumber the labels so that they conform to the specs

    nlabels = amax(segmentation) + 1
    renumber = zeros(nlabels, 'i')
    for i, v in enumerate(lsort):
        renumber[lines[v].label] = 0x010000 + (i + 1)
    segmentation = renumber[segmentation]

    # finally, output everything

    if not args.quiet:
        print("writing lines")
    if not os.path.exists(outputdir):
        os.mkdir(outputdir)
    lines = [lines[i] for i in lsort]
    ocrolib.write_page_segmentation("%s.pseg.png" % outputdir, segmentation)
    cleaned = ocrolib.remove_noise(binary, args.noise)
    for i, l in enumerate(lines):
        binline = psegutils.extract_masked(1 - cleaned,
                                           l,
                                           pad=args.pad,
                                           expand=args.expand)
        ocrolib.write_image_binary("%s/01%04x.bin.png" % (outputdir, i + 1),
                                   binline)
        if args.gray:
            grayline = psegutils.extract_masked(gray,
                                                l,
                                                pad=args.pad,
                                                expand=args.expand)
            ocrolib.write_image_gray("%s/01%04x.nrm.png" % (outputdir, i + 1),
                                     grayline)
    print("%6d" % i, fname, "%4.1f" % scale, len(lines))
Exemplo n.º 7
0
    def _process_segment(self, page_image, page, region_xywh, page_id,
                         input_file, n):
        binary = ocrolib.pil2array(page_image)
        binary = np.array(1 - binary / np.amax(binary), 'B')
        if page.get_TextRegion() is None or len(page.get_TextRegion()) < 1:
            min_x, max_x = (0, binary.shape[0])
            min_y, max_y = (0, binary.shape[1])
            textregion = TextRegionType(
                Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" %
                                  (min_x, min_y, max_x, min_y, max_x, max_y,
                                   min_x, max_y)))
            page.add_TextRegion(textregion)
        else:
            textregion = page.get_TextRegion()[-1]
        ocrolib.write_image_binary("test.bin.png", binary)
        if self.parameter['scale'] == 0:
            scale = psegutils.estimate_scale(binary)
        else:
            scale = self.parameter['scale']
        if np.isnan(
                scale) or scale > 1000.0 or scale < self.parameter['minscale']:
            LOG.warning("%s: bad scale (%g); skipping\n" % (fname, scale))
            return

        segmentation = self.compute_segmentation(binary, scale)
        if np.amax(segmentation) > self.parameter['maxlines']:
            LOG.warning("%s: too many lines %i",
                        (fname, np.amax(segmentation)))
            return
        lines = psegutils.compute_lines(segmentation, scale)
        order = psegutils.reading_order([l.bounds for l in lines])
        lsort = psegutils.topsort(order)

        # renumber the labels so that they conform to the specs

        nlabels = np.amax(segmentation) + 1
        renumber = np.zeros(nlabels, 'i')
        for i, v in enumerate(lsort):
            renumber[lines[v].label] = 0x010000 + (i + 1)
        segmentation = renumber[segmentation]

        lines = [lines[i] for i in lsort]
        cleaned = ocrolib.remove_noise(binary, self.parameter['noise'])
        region_xywh['features'] += ",textline"
        for i, l in enumerate(lines):
            ocrolib.write_image_binary("test.bin.png", binary[l.bounds[0],
                                                              l.bounds[1]])
            min_x, max_x = (l.bounds[0].start, l.bounds[0].stop)
            min_y, max_y = (l.bounds[1].start, l.bounds[1].stop)

            img = binary[l.bounds[0], l.bounds[1]]
            img = np.array(255 * (img > ocrolib.midrange(img)), 'B')
            img = ocrolib.array2pil(img)

            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.image_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.image_grp, n)

            file_path = self.workspace.save_image_file(img,
                                                       file_id + "_" + str(i),
                                                       page_id=page_id,
                                                       file_grp=self.image_grp)
            ai = AlternativeImageType(filename=file_path,
                                      comments=region_xywh['features'])
            line = TextLineType(
                Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" %
                                  (min_x, min_y, max_x, min_y, max_x, max_y,
                                   min_x, max_y)))
            line.add_AlternativeImage(ai)
            textregion.add_TextLine(line)