示例#1
0
def my_filter(imageIn):
    """Whiten connected components that are unlikely to be characters.

    The image is segmented with ``cc_analysis()`` and a component is filled
    with white when any of these holds:

    * it is a tall thin run (height/width ratio above 6 and taller than
      1.5 times the median component height), e.g. a margin leftover;
    * its black area is more than 10 times the median black area;
    * its black area is less than a tenth of the median black area.

    Nothing is returned; the function only calls ``fill_white()`` on the
    components of *imageIn* (no copy of the image is made here) and prints
    a short progress report.

    Raises:
        ImageSegmentationError: if the image has no connected components
            or more than ``MAX_CCS`` of them.
    """
    MAX_CCS = 4000
    image = imageIn
    ccs = image.cc_analysis()
    # Single-argument print: same output on Python 2, valid on Python 3.
    print("filter started on %s elements..." % len(ccs))
    if len(ccs) < 1:
        raise ImageSegmentationError("there are no ccs")
    if len(ccs) > MAX_CCS:
        raise ImageSegmentationError("there are more than " + str(MAX_CCS) + " ccs.")

    # Take every measurement before whitening anything, then decide once per
    # component.  Separate passes counted a component once for every rule it
    # matched, which made the "elements left" figure too small.
    areas = [cc.black_area()[0] for cc in ccs]
    median_black_area = median(areas)
    median_height = median([cc.nrows for cc in ccs])

    count = 0
    for cc, area in zip(ccs, areas):
        # Floor division: this is what "/" did on these integer dimensions
        # under Python 2, so the ratio test is effectively "at least 7".
        is_vertical_run = (cc.nrows // cc.ncols > 6) and (cc.nrows > 1.5 * median_height)
        is_too_large = area > (median_black_area * 10)
        is_too_small = area < (median_black_area / 10)
        if is_vertical_run or is_too_large or is_too_small:
            cc.fill_white()
            count += 1
    print("filter done. %s elements left." % (len(ccs) - count))
示例#2
0
def chars_make_words(lines_glyphs,threshold=None):
  """Groups the given glyphs to words based upon the horizontal distance
between adjacent glyphs.

Signature:
    ``chars_make_words (lines_glyphs, threshold=None)``

with

    *lines_glyphs*:
      A list of ``Cc`` data types, each of which representing a character.
      All glyphs must stem from the same single line of text and are
      processed in the order given.

    *threshold*:
      Horizontal white space greater than *threshold* will be considered
      a word separating gap. When ``None``, the threshold value is
      calculated automatically as 2.5 times the median white space
      between adjacent glyphs (or 0 when there are fewer than two glyphs).

The result is a nested list of glyphs with each sublist representing
a word. This is the same data structure as used in `Textline.words`_

.. _`Textline.words`: gamera.toolkits.ocr.classes.Textline.html
"""

  # Work on a copy so the caller's list is never touched.
  glyphs = lines_glyphs[:]
  if not glyphs:
    return []

  # gaps[i] is the white space between glyphs[i] and glyphs[i + 1]; it is
  # needed both for the automatic threshold and for the split decision.
  gaps = [right.ul_x - left.lr_x for left, right in zip(glyphs, glyphs[1:])]

  if threshold is None:
    if gaps:
      threshold = median(gaps) * 2.5
    else:
      # A single glyph has no gaps to take a median of.
      threshold = 0

  wordlist = []
  word = [glyphs[0]]
  for gap, glyph in zip(gaps, glyphs[1:]):
    if gap > threshold:
      # The gap before this glyph is wide enough to end the current word.
      wordlist.append(word)
      word = []
    word.append(glyph)
  wordlist.append(word)
  return wordlist
def my_filter(imageIn):
    """Whiten connected components whose black area is far from the median.

    The image is segmented with ``cc_analysis()``; every component whose
    black area is more than 10 times, or less than a tenth of, the median
    black area is filled with white.

    Nothing is returned; the function only calls ``fill_white()`` on the
    components of *imageIn* (no copy of the image is made here) and prints
    a short progress report.
    """
    image = imageIn
    ccs = image.cc_analysis()
    # Single-argument print: same output on Python 2, valid on Python 3.
    print("filter started on %s elements..." % len(ccs))

    # Measure all areas before whitening anything, then decide once per
    # component so that each removed component is counted exactly once.
    areas = [cc.black_area()[0] for cc in ccs]
    median_black_area = median(areas)
    #also check for height?
    count = 0
    for cc, area in zip(ccs, areas):
        is_too_large = area > (median_black_area * 10)
        is_too_small = area < (median_black_area / 10)
        if is_too_large or is_too_small:
            cc.fill_white()
            count += 1
    print("filter done. %s elements left." % (len(ccs) - count))
def my_filter(imageIn):
    """Remove area outliers from an image by filling them with white.

    Connected components are obtained from ``cc_analysis()``.  A component
    is whitened when its black area exceeds 10 times the median black area
    or falls below one tenth of it.

    The function returns nothing: its effect is the ``fill_white()`` calls
    on the components of *imageIn* (the image is not copied here) and two
    progress lines on standard output.
    """
    image = imageIn
    ccs = image.cc_analysis()
    # Parenthesised single-argument print keeps the Python 2 output and
    # also parses under Python 3.
    print("filter started on %s elements..." % len(ccs))

    # Areas are sampled once, up front: a component is then judged a single
    # time and can no longer be counted by more than one pass.
    black_areas = [cc.black_area()[0] for cc in ccs]
    median_black_area = median(black_areas)
    upper_limit = median_black_area * 10
    lower_limit = median_black_area / 10
    #also check for height?
    count = 0
    for cc, black_area in zip(ccs, black_areas):
        if black_area > upper_limit or black_area < lower_limit:
            cc.fill_white()
            count += 1
    print("filter done. %s elements left." % (len(ccs) - count))
示例#5
0
def performGreekOCR(options):
    """Run the Greek OCR over one or more page images and emit the results.

    Two ``GreekOCR`` engines are configured from *options*: one for the text
    body (mode ``options["mode"] + "body"``) and one for the apparatus
    criticus (mode ``options["mode"] + "appcrit"``).  Each input image is
    binarised once per Otsu factor, optionally filtered, size-checked,
    recognised and written out.

    *options* is a dict.  These keys are read unconditionally and must be
    present: ``split``, ``mode``, ``autogroup``, ``debug``, ``trainingdata``,
    ``hocrfile``, ``settingsfile``, ``otsu``, ``directory``, ``imagefile``
    (the last one only when ``directory`` is false).  ``unicodeoutfile`` is
    also read unconditionally once an image passes the size check.
    These keys are optional: ``sql``, ``hocrout``, ``filter``, ``deskew``,
    ``teubneroutfile``.

    ``options["otsu"]`` is a comma-separated list of factors; each factor is
    multiplied with the Otsu threshold of a non-ONEBIT image (capped at 253)
    to get the binarisation threshold for one pass.

    Returns nothing.  Output goes to files, to the SQL / hOCR helpers and to
    standard output.
    """
    import mahotas as mh
    import numpy as np
    # NOTE(review): imshow/gray/show are not used below; the import is kept
    # only so that importing pylab still happens exactly as before.
    from pylab import imshow, gray, show
    from gamera.plugins import numpy_io

    MAX_CCS = 6500
    features = ["aspect_ratio","moments","ncols_feature","nholes","nholes_extended","nrows_feature","skeleton_features","top_bottom","volume","volume16regions","volume64regions","zernike_moments"]

    # Option lookups that are repeated for every image.  dict.get() returns
    # None for a missing key, which reproduces the former
    # "has_key(k) and options[k] ..." tests.
    debug_on = options.get("debug") == True
    use_hocr_out = options.get("hocrout")
    use_sql = options.get("sql")
    is_teubner = options.get("mode") == "teubner"

    image_files = []
    g = GreekOCR(splits=options["split"],feats=features)
    g.mode = options["mode"] + "body"
    g.autogroup = options["autogroup"]
    g.debug = options["debug"]
    g.load_trainingdata(options['trainingdata'])
    g_appcrit = GreekOCR(splits=options["split"], feats=features)
    g_appcrit.mode = options["mode"] + "appcrit"
    g_appcrit.autogroup = options["autogroup"]
    g_appcrit.debug = options["debug"]
    g_appcrit.load_trainingdata(options['trainingdata'])

    if options["hocrfile"]:
        g.hocr = (options["hocrfile"])
    if options["settingsfile"]:
        g.load_settings(options["settingsfile"])
        g_appcrit.load_settings(options["settingsfile"])
    if options["otsu"]:
        otsu_factors = [float(x) for x in options["otsu"].split(',')]
    else:
        otsu_factors = [0]

    if options["directory"]:
        # Dot escaped so that only a real ".png" suffix matches.
        png_pattern = re.compile(r"\.png$",re.IGNORECASE)
        image_files = sorted(
            path for path in
            (os.path.join(options["directory"],x) for x in os.listdir(options["directory"]))
            if png_pattern.search(path))
    elif options["imagefile"]:
        image_files = options["imagefile"]

    if not image_files:
        # Nothing to do; image_files[0] below would raise IndexError.
        print("No image files to process.")
        return

    image_file_count = 1
    # The book code is the name of the directory holding the first image.
    image_path = os.path.abspath(image_files[0])
    image_split_path = os.path.split(image_path)
    book_code = os.path.split(image_split_path[0])[1]
    book_id = 0
    if use_sql:
        book_id = sql_make_book_and_return_id(book_code)

    for image_file in image_files:

        image_path = os.path.abspath(image_file)
        image_split_path = os.path.split(image_path)
        book_code = os.path.split(image_split_path[0])[1]#directory name
        image_file_name = image_split_path[1]
        imageBase, imageEx = os.path.splitext(image_file_name)
        threshold_info = ""
        print("Now working with image: " + image_file_name)
        internal_image_file_path = os.path.join(book_code, image_file_name)
        if imageEx == ".jp2":
            # JPEG 2000 is read through mahotas and converted to a Gamera
            # image via numpy; any failure here is reported and re-raised.
            try:
                jp2Image = mh.imread(image_file, as_grey=True)
                jp2Image = jp2Image.astype(np.uint8)
                imageIn = numpy_io.from_numpy(jp2Image)
            except:
                print("Unexpected error: %s" % (sys.exc_info()[0],))
                raise
        else:
            try:
                imageIn = load_image(image_file)
            except Exception:
                # Best effort: an unreadable image is skipped, but no longer
                # silently, and Ctrl-C is no longer swallowed here.
                # NOTE(review): image_file_count is not advanced for a
                # skipped image, as before.
                print("Could not load image, skipping: " + image_file_name)
                continue
        imageType = imageIn.data.pixel_type
        if imageType != ONEBIT:
            if imageType != GREYSCALE:
                imageIn = imageIn.to_greyscale()
            otsu_thresh = imageIn.otsu_find_threshold()

        for otsu_factor in otsu_factors:
            if use_hocr_out:
                hocr_tree = hocr_make_tree_and_return(book_code)
            if imageIn.data.pixel_type == ONEBIT:
                threshold_info = "thresh_128"
                otsu_thresh = 1.0
                image = imageIn
                if options["debug"]:
                    print("image is ONEBIT; doing no threshold optimization.")
            else:
                current_thresh = otsu_thresh * otsu_factor
                if current_thresh > 253.0:
                    current_thresh = 253.0
                current_thresh = int(current_thresh)
                threshold_info = "thresh_" + str(int(current_thresh))# + "=" + str(otsu_factor)
                image = imageIn.threshold(current_thresh)
                print("Otsu factor:  %s  threshold:  %s" % (otsu_factor, current_thresh))
            if options["hocrfile"]:
                # "%s" in the hOCR file name is a placeholder for the image
                # base name.
                hocr_to_use = options["hocrfile"].replace("%s",imageBase)
                g.hocr = hocr_to_use
                if options["debug"]:
                    print("using '" + hocr_to_use + "' as hocr file")

            # Segment unconditionally: the size check further down needs the
            # component count of *this* image whether or not filtering is on.
            # When this was done only inside the filter branch, a run without
            # "filter" hit an unbound (or stale) ccs at the size check.
            # NOTE(review): cc_analysis() therefore now also runs on the
            # image when filtering is disabled.
            ccs = image.cc_analysis()
            if options.get("filter") == True:
                count = 0
                if debug_on:
                    print("filter started on %s elements..." % len(ccs))
                # TODO: an aggressive filter for long vertical runs left over
                # from margins (nrows/ncols > 6 and nrows > 1.5 * median
                # height) was tried here; it needs a condition that keeps
                # such runs at the page edges.
                if ccs:
                    # Measure before whitening and decide once per component
                    # so that every removed component is counted once.
                    areas = [cc.black_area()[0] for cc in ccs]
                    median_black_area = median(areas)
                    for cc, area in zip(ccs, areas):
                        if (area > (median_black_area * 10)) or (area < (median_black_area / 10)):
                            cc.fill_white()
                            count += 1
                if debug_on:
                    print("filter done. %s elements left." % (len(ccs) - count))

            if (len(ccs) < 5) or (len(ccs) > MAX_CCS):
                print("Error: there are " + str(len(ccs)) +  " ccs. Max is " + str( MAX_CCS) +  " Omitting this image.")
            else:
                if options.get("deskew") == True:
                    if debug_on:
                        print("\ntry to skew correct...")
                    rotation = image.rotation_angle_projections(-10,10)[0]
                    # NOTE(review): the rotated image is never used below, so
                    # deskewing currently has no effect on recognition.
                    # Confirm whether "image" was meant to be replaced.
                    img = image.rotate(rotation,0)
                    if debug_on:
                        print("rotated with %s angle" % (rotation,))
                if is_teubner:
                    # Teubner pages are split into body and apparatus, each
                    # recognised by its own engine.
                    (body_image, app_crit_image) = splitAppCritTeubner(image)
                    output = g.process_image(body_image)
                    if app_crit_image:
                        print("there is an app. crit image")
                        appcrit_output = g_appcrit.process_image(app_crit_image)
                    else:
                        print("there is no app. crit image")
                        appcrit_output = ""
                    output = output + appcrit_output
                else:
                    output = g.process_image(image)
                # NOTE(review): this requires the "unicodeoutfile" key, so the
                # teubneroutfile / stdout branches below are only reachable
                # when that key exists.
                output_file_name_base = options["unicodeoutfile"] + imageBase + "_" +imageEx[1:] + "_" + threshold_info
                if debug_on:
                    g.save_debug_images(output_file_name_base)
                    if is_teubner and app_crit_image:
                        #TODO: make more general
                        g_appcrit.save_debug_images(output_file_name_base + "_appcrit")
                if use_hocr_out:
                    # Storing into hocr_tree directly keeps all pages in one
                    # tree; a separate div per input page is not created.
                    g.store_hocr(internal_image_file_path,hocr_tree)
                    if is_teubner and app_crit_image:
                        g_appcrit.store_hocr(internal_image_file_path,hocr_tree)
                if use_sql:
                    page_id = sql_make_page_and_return_id(internal_image_file_path,image_file_count,book_id)
                    g.store_sql(image_path,page_id)
                if "unicodeoutfile" in options:
                    if use_hocr_out:
                        g.save_text_hocr(hocr_tree, output_file_name_base + ".html")
                    else:
                        g.save_text_unicode( output_file_name_base + ".txt")
                        if is_teubner:
                            #TODO: make the above more general
                            g_appcrit.save_text_unicode( output_file_name_base + "_appcrit.txt")
                elif "teubneroutfile" in options:
                    g.save_text_teubner(options["teubneroutfile"])
                else:
                    print(output)
        image_file_count += 1
    def __call__(self, Ex=-1, Ey=-1, iterations=2):
        """Segment the image by merging overlapping, extended CC boxes.

        The connected components of a copy of the image are computed, each
        bounding box is grown by *Ex* / *Ey* pixels on every side (clipped
        to the page), and boxes that intersect are merged.  Merging is
        repeated up to *iterations* times and stops early once a pass no
        longer reduces the number of boxes.

        Parameters:
            Ex: horizontal extension in pixels; ``-1`` selects twice the
                median CC width.
            Ey: vertical extension in pixels; ``-1`` selects the median CC
                height.
            iterations: maximum number of merging passes.

        Returns:
            A list with one ``Cc`` per merged segment.  Segment ``i``
            (0-based) gets label ``i + 1``; its member components are
            highlighted on ``self`` with that label.  An image without
            connected components yields ``[]``.
        """
        # bbox with contained cc indices
        class Bbox:
            def __init__(self, allccs, indices):
                self.ccs = allccs
                self.indices = indices
                if len(indices) == 1:
                    self.rect = Rect(allccs[indices[0]])
                else:
                    self.rect = allccs[indices[0]].union_images(
                        [allccs[i] for i in indices])

            def extend(self, Ex, Ey, img):
                # Grow the box by Ex/Ey on every side, clipped to img.
                ul_y = max(0, self.rect.ul_y - Ey)
                ul_x = max(0, self.rect.ul_x - Ex)
                lr_y = min(img.lr_y, self.rect.lr_y + Ey)
                lr_x = min(img.lr_x, self.rect.lr_x + Ex)
                nrows = lr_y - ul_y + 1
                ncols = lr_x - ul_x + 1
                self.rect = Rect(Point(ul_x, ul_y), Dim(ncols, nrows))

            def merge(self, other):
                # Absorb the other box: take over its CC indices and
                # combine the rectangles (union() presumably grows
                # self.rect in place -- its result is not used here).
                self.indices += other.indices
                self.rect.union(other.rect)

        # does one merging step
        def merge_boxes(bboxes):
            from gamera import graph
            # Sort by top edge.  A key function orders the boxes exactly as
            # the former comparison on the ul_y difference did, and also
            # works where list.sort() no longer accepts a cmp function.
            bboxes.sort(key=lambda b: b.rect.ul_y)
            g = graph.Graph(graph.UNDIRECTED)
            # build graph where edge means overlap of two boxes
            for i in range(len(bboxes)):
                g.add_node(i)
            for i in range(len(bboxes)):
                for j in range(i + 1, len(bboxes)):
                    # Boxes are sorted by top edge, so once box j starts
                    # below box i no later box can intersect box i.
                    if bboxes[j].rect.ul_y > bboxes[i].rect.lr_y:
                        break
                    if bboxes[i].rect.intersects(bboxes[j].rect):
                        if not g.has_edge(i, j):
                            g.add_edge(i, j)
            # Every connected subgraph becomes one merged box.
            new_bboxes = []
            for sg in g.get_subgraph_roots():
                seg = [n() for n in g.BFS(sg)]
                bbox = bboxes[seg[0]]
                for i in range(1, len(seg)):
                    bbox.merge(bboxes[seg[i]])
                new_bboxes.append(bbox)
            return new_bboxes

        # the actual plugin
        from gamera.core import Dim, Rect, Point, Cc
        from gamera.plugins.listutilities import median

        # Segment a copy; self is only touched by highlight() below.
        page = self.image_copy()
        ccs = page.cc_analysis()
        if not ccs:
            # No components: nothing to merge, and the default extensions
            # would need the median of an empty list.  With explicit Ex/Ey
            # the code below already produced an empty result.
            return []

        # compute average CC size
        if Ex == -1:
            Ex = 2 * median([c.ncols for c in ccs])
        if Ey == -1:
            Ey = median([c.nrows for c in ccs])

        # create merged segments
        bboxes = [Bbox(ccs, [i]) for i in range(len(ccs))]
        for bb in bboxes:
            bb.extend(Ex, Ey, page)
        for i in range(iterations):
            oldlen = len(bboxes)
            bboxes = merge_boxes(bboxes)
            if oldlen == len(bboxes):
                # Nothing was merged in this pass; further passes would
                # not change anything either.
                break
        seg_ccs = []
        for i, bbox in enumerate(bboxes):
            label = i + 1
            ccs_of_segment = [ccs[j] for j in bbox.indices]
            for cc in ccs_of_segment:
                self.highlight(cc, label)
            seg_ccs.append(
                Cc(self, label, ccs_of_segment[0].union_rects(ccs_of_segment)))
        return seg_ccs
示例#7
0
    def __call__(self, Ex=-1, Ey=-1, iterations=2):
        """Group connected components into segments by bounding-box merging.

        Works on a copy of the image: its connected components are
        computed, every bounding box is enlarged by *Ex* horizontally and
        *Ey* vertically (clipped to the page), and intersecting boxes are
        merged.  At most *iterations* merging passes are run; the loop ends
        early when a pass leaves the number of boxes unchanged.

        Parameters:
            Ex: horizontal enlargement in pixels.  The default ``-1`` means
                twice the median CC width.
            Ey: vertical enlargement in pixels.  The default ``-1`` means
                the median CC height.
            iterations: upper bound on the number of merging passes.

        Returns:
            A list of ``Cc`` objects over ``self``, one per segment, with
            labels 1, 2, ... in list order.  The member components of each
            segment are highlighted on ``self`` with the segment's label.
            ``[]`` is returned when the image has no connected components.
        """
        # bbox with contained cc indices
        class Bbox:
            def __init__(self, allccs, indices):
                self.ccs = allccs
                self.indices = indices
                if len(indices) == 1:
                    self.rect = Rect(allccs[indices[0]])
                else:
                    self.rect = allccs[indices[0]].union_images([allccs[i] for i in indices])
            def extend(self, Ex, Ey, img):
                # Enlarge by Ex/Ey in each direction without leaving img.
                ul_y = max(0, self.rect.ul_y - Ey)
                ul_x = max(0, self.rect.ul_x - Ex)
                lr_y = min(img.lr_y, self.rect.lr_y + Ey)
                lr_x = min(img.lr_x, self.rect.lr_x + Ex)
                nrows = lr_y - ul_y + 1
                ncols = lr_x - ul_x + 1
                self.rect = Rect(Point(ul_x, ul_y), Dim(ncols, nrows))
            def merge(self, other):
                # Take over the other box's CC indices and combine both
                # rectangles (union() presumably updates self.rect in
                # place -- its return value is ignored).
                self.indices += other.indices
                self.rect.union(other.rect)
        # does one merging step
        def merge_boxes(bboxes):
            from gamera import graph
            # Order by top edge.  key= yields the same order as comparing
            # ul_y differences and does not rely on cmp-style sorting,
            # which list.sort() dropped in Python 3.
            bboxes.sort(key=lambda b: b.rect.ul_y)
            g = graph.Graph(graph.UNDIRECTED)
            # build graph where edge means overlap of two boxes
            for i in range(len(bboxes)):
                g.add_node(i)
            for i in range(len(bboxes)):
                for j in range(i+1, len(bboxes)):
                    # Thanks to the sort, the first box starting below
                    # box i ends the search for partners of box i.
                    if bboxes[j].rect.ul_y > bboxes[i].rect.lr_y:
                        break
                    if bboxes[i].rect.intersects(bboxes[j].rect):
                        if not g.has_edge(i,j):
                            g.add_edge(i,j)
            # One merged box per connected subgraph.
            new_bboxes = []
            for sg in g.get_subgraph_roots():
                seg = [n() for n in g.BFS(sg)]
                bbox = bboxes[seg[0]]
                for i in range(1, len(seg)):
                    bbox.merge(bboxes[seg[i]])
                new_bboxes.append(bbox)
            return new_bboxes

        # the actual plugin
        from gamera.core import Dim, Rect, Point, Cc
        from gamera.plugins.listutilities import median

        # cc_analysis runs on a copy; self only receives highlight() calls.
        page = self.image_copy()
        ccs = page.cc_analysis()
        if not ccs:
            # Empty page: skip the median of an empty list that the default
            # Ex/Ey would require.  The explicit-Ex/Ey path already ended
            # with an empty list in this situation.
            return []

        # compute average CC size
        if Ex == -1:
            Ex = 2*median([c.ncols for c in ccs])
        if Ey == -1:
            Ey = median([c.nrows for c in ccs])

        # create merged segments
        bboxes = [Bbox(ccs, [i]) for i in range(len(ccs))]
        for bb in bboxes:
            bb.extend(Ex, Ey, page)
        for i in range(iterations):
            oldlen = len(bboxes)
            bboxes = merge_boxes(bboxes)
            if oldlen == len(bboxes):
                # A pass without any merge means the result is stable.
                break
        seg_ccs = []
        for i,bbox in enumerate(bboxes):
            label = i+1
            ccs_of_segment = [ccs[j] for j in bbox.indices]
            for cc in ccs_of_segment:
                self.highlight(cc, label)
            seg_ccs.append(Cc(self, label, ccs_of_segment[0].union_rects(ccs_of_segment)))
        return seg_ccs