Пример #1
0
def text_like_histogram(img, area, average_size):
    """Decide whether ``area`` of ``img`` has text-like projection histograms.

    The check is skipped (and ``True`` returned) unless the
    ``additional_filtering`` option is enabled.  Otherwise the area is
    rejected (``False``) when any of the following holds:

    * it contains fewer than two connected components;
    * the mean connected-component width or height is below
      ``defaults.MINIMUM_TEXT_SIZE_THRESHOLD``;
    * the mean width/height ratio lies outside ``[0.5, 2]``;
    * both the column and the row histogram contain fewer than two
      white runs;
    * the second statistic returned by ``slicing_list_stats`` for the
      row-histogram white runs exceeds
      ``defaults.MAXIMUM_VERTICAL_SPACE_VARIANCE``.

    Args:
        img: 2-D image array indexed as ``img[row, column]``; non-zero
            pixels are the ones counted ("black" in this module's terms).
        area: index object accepted both by ``img[area]`` and by
            ``dimensions_2d_slice``, which yields ``(x, y, w, h)``.
        average_size: unused; the average is recomputed from the area.
            Kept so existing callers do not break.

    Returns:
        bool: ``True`` if the area passes every filter, else ``False``.
    """
    if not arg.boolean_value('additional_filtering'):
        return True

    verbose = arg.boolean_value('verbose')
    (x, y, w, h) = dimensions_2d_slice(area)
    aoi = img[area]

    if len(cc.get_connected_components(aoi)) < 2:
        return False

    avg = cc.average_size(aoi)
    mean_width = cc.mean_width(aoi)
    mean_height = cc.mean_height(aoi)
    if verbose:
        print('average size = ' + str(avg) + ' mean width = ' + str(
            mean_width) + ' mean height = ' + str(mean_height))
    if (math.isnan(avg) or avg == 0) and verbose:
        # Diagnostic only: the early return that used to follow this
        # message was deliberately disabled, so processing continues.
        print('Rejecting area since average size is NaN')

    # In a text area the mean blob (cc) size reflects the typeface, so an
    # area whose components are too small cannot be text.
    # TODO: replace the mean-based test with hard per-component width and
    # height thresholds (drop areas made only of thin, small ccs).
    if (mean_width < defaults.MINIMUM_TEXT_SIZE_THRESHOLD or
            mean_height < defaults.MINIMUM_TEXT_SIZE_THRESHOLD):
        if verbose:
            print('Rejecting area since average width or height is less than threshold.')
        return False

    # Force true division: with integer means Python 2 would floor the
    # ratio (e.g. 3 / 4 == 0) and wrongly reject the area.
    aspect_ratio = float(mean_width) / mean_height
    if aspect_ratio < 0.5 or aspect_ratio > 2:
        if verbose:
            print('Rejecting area since mean cc aspect ratio not textlike.')
        return False

    # Smooth the whole image along one axis at a time to emphasise peaks
    # and troughs.  gaussian_filter allocates its own output, so no
    # defensive copy of img is needed.
    sigma = 0.01 * float(avg)
    blurred_axis0 = scipy.ndimage.gaussian_filter(img, (sigma, 0))
    blurred_axis1 = scipy.ndimage.gaussian_filter(img, (0, sigma))

    # Non-zero pixel count per column (x direction) and per row (y direction)
    # inside the area.
    x_histogram = np.count_nonzero(blurred_axis1[y:y + h, x:x + w], axis=0)
    y_histogram = np.count_nonzero(blurred_axis0[y:y + h, x:x + w], axis=1)

    h_white_runs = get_white_runs(x_histogram)
    h_black_runs = get_black_runs(x_histogram)
    (h_spacing_mean, h_spacing_variance) = slicing_list_stats(h_white_runs)
    (h_character_mean, h_character_variance) = slicing_list_stats(h_black_runs)
    v_white_runs = get_white_runs(y_histogram)
    v_black_runs = get_black_runs(y_histogram)
    (v_spacing_mean, v_spacing_variance) = slicing_list_stats(v_white_runs)
    (v_character_mean, v_character_variance) = slicing_list_stats(v_black_runs)

    num_h_white_runs = len(h_white_runs)
    num_v_white_runs = len(v_white_runs)

    if verbose:
        print('x ' + str(x) + ' y ' + str(y) + ' w ' + str(w) + ' h ' + str(h))
        print('white runs ' + str(num_h_white_runs) + ' ' + str(
            num_v_white_runs))
        print('white runs mean ' + str(h_spacing_mean) + ' ' + str(
            v_spacing_mean))
        print('white runs std  ' + str(h_spacing_variance) + ' ' + str(
            v_spacing_variance))
        print('black runs ' + str(len(h_black_runs)) + ' ' + str(
            len(v_black_runs)))
        print('black runs mean ' + str(h_character_mean) + ' ' + str(
            v_character_mean))
        print('black runs std  ' + str(h_character_variance) + ' ' + str(
            v_character_variance))

    if num_h_white_runs < 2 and num_v_white_runs < 2:
        if verbose:
            print('Rejecting area since not sufficient amount post filtering whitespace.')
        return False

    if v_spacing_variance > defaults.MAXIMUM_VERTICAL_SPACE_VARIANCE:
        if verbose:
            print('Rejecting area since vertical inter-character space variance too high.')
        return False

    # NOTE: the black-run means are reported above but intentionally do not
    # influence the verdict; the checks against avg were disabled upstream.
    return True
Пример #2
0
def text_like_histogram(img, area, average_size):
  """Return True when the given area of img looks like text.

  When the 'additional_filtering' option is off the function accepts the
  area unconditionally. Otherwise it applies, in order:

  1. at least two connected components must be found in the area;
  2. the mean component width and height must both reach
     defaults.MINIMUM_TEXT_SIZE_THRESHOLD;
  3. the mean width/height ratio must lie within [0.5, 2];
  4. the column histogram or the row histogram of non-zero pixels must
     contain at least two white runs;
  5. the second value returned by slicing_list_stats for the row-histogram
     white runs must not exceed defaults.MAXIMUM_VERTICAL_SPACE_VARIANCE.

  Parameters:
    img          -- 2-D image array, addressed as img[row, column].
    area         -- index usable as img[area]; dimensions_2d_slice(area)
                    supplies its (x, y, w, h).
    average_size -- ignored (the value is recomputed from the area); it
                    remains in the signature for caller compatibility.

  Returns a bool.
  """
  if not arg.boolean_value('additional_filtering'):
    return True

  verbose = arg.boolean_value('verbose')
  (x, y, w, h) = dimensions_2d_slice(area)
  aoi = img[area]

  ccs = cc.get_connected_components(aoi)
  if len(ccs) < 2:
    return False

  avg = cc.average_size(aoi)
  mean_width = cc.mean_width(aoi)
  mean_height = cc.mean_height(aoi)
  if verbose:
    print('average size = ' + str(avg) + ' mean width = ' + str(mean_width) + ' mean height = ' + str(mean_height))
    if math.isnan(avg) or avg == 0:
      # Only logged: the matching early return was switched off on purpose,
      # so the area keeps being evaluated.
      print('Rejecting area since average size is NaN')

  # The mean size of a blob (cc) in a text area mirrors the character size,
  # so areas with too few pixels per component are dropped.
  # TODO: test hard width/height thresholds per connected component instead
  # of means, so areas consisting solely of thin small ccs are dropped.
  too_small = (mean_width < defaults.MINIMUM_TEXT_SIZE_THRESHOLD or
               mean_height < defaults.MINIMUM_TEXT_SIZE_THRESHOLD)
  if too_small:
    if verbose:
      print('Rejecting area since average width or height is less than threshold.')
    return False

  # float() guarantees true division; integer means would otherwise be
  # floor-divided under Python 2 and distort the ratio.
  ratio = float(mean_width) / mean_height
  if ratio < 0.5 or ratio > 2:
    if verbose:
      print('Rejecting area since mean cc aspect ratio not textlike.')
    return False

  # Gaussian-filter the image separately along each axis to emphasise peaks
  # and troughs. The filter returns a new array, so img is passed directly.
  blur = 0.01 * float(avg)
  smoothed_rows = scipy.ndimage.gaussian_filter(img, (blur, 0))
  smoothed_cols = scipy.ndimage.gaussian_filter(img, (0, blur))

  # Histogram of non-zero pixels along columns, then along rows, restricted
  # to the area's bounding box.
  x_histogram = np.count_nonzero(smoothed_cols[y:y+h, x:x+w], axis=0)
  y_histogram = np.count_nonzero(smoothed_rows[y:y+h, x:x+w], axis=1)

  h_white_runs = get_white_runs(x_histogram)
  h_black_runs = get_black_runs(x_histogram)
  (h_spacing_mean, h_spacing_variance) = slicing_list_stats(h_white_runs)
  (h_character_mean, h_character_variance) = slicing_list_stats(h_black_runs)
  v_white_runs = get_white_runs(y_histogram)
  v_black_runs = get_black_runs(y_histogram)
  (v_spacing_mean, v_spacing_variance) = slicing_list_stats(v_white_runs)
  (v_character_mean, v_character_variance) = slicing_list_stats(v_black_runs)

  if verbose:
    print('x ' + str(x) + ' y ' + str(y) + ' w ' + str(w) + ' h ' + str(h))
    print('white runs ' + str(len(h_white_runs)) + ' ' + str(len(v_white_runs)))
    print('white runs mean ' + str(h_spacing_mean) + ' ' + str(v_spacing_mean))
    print('white runs std  ' + str(h_spacing_variance) + ' ' + str(v_spacing_variance))
    print('black runs ' + str(len(h_black_runs)) + ' ' + str(len(v_black_runs)))
    print('black runs mean ' + str(h_character_mean) + ' ' + str(v_character_mean))
    print('black runs std  ' + str(h_character_variance) + ' ' + str(v_character_variance))

  if len(h_white_runs) < 2 and len(v_white_runs) < 2:
    if verbose:
      print('Rejecting area since not sufficient amount post filtering whitespace.')
    return False

  if v_spacing_variance > defaults.MAXIMUM_VERTICAL_SPACE_VARIANCE:
    if verbose:
      print('Rejecting area since vertical inter-character space variance too high.')
    return False

  # The black-run (character) means are only reported in verbose mode; the
  # comparisons against avg never affected the result and were dropped.
  return True