def test_textline_reading_order():
    """Check ``textline_reading_order`` against stored orders for two pages.

    Every sample image is loaded, split with ``cc_analysis`` and handed to
    ``textline_reading_order``; the bounding boxes of the returned segments,
    in the order returned, must equal the expected list for that image.
    """
    from gamera.core import load_image, init_gamera
    init_gamera()
    from gamera.plugins.pagesegmentation import textline_reading_order

    # image path -> expected (ul_x, ul_y, lr_x, lr_y) boxes, in reading order
    correct_orders = {
        "data/reading_order_2.png": [
            (42, 40, 1462, 114),
            (42, 158, 683, 232),
            (42, 276, 683, 350),
            (42, 418, 683, 492),
            (42, 560, 683, 633),
            (822, 158, 1462, 633),
            (42, 701, 1462, 775),
            (42, 843, 494, 917),
            (562, 843, 1132, 917),
            (42, 985, 683, 1059),
            (822, 985, 1132, 1059),
            (1200, 843, 1462, 1059),
        ],
        "data/reading_order.png": [
            (51, 56, 1471, 130),
            (51, 174, 691, 248),
            (51, 292, 691, 366),
            (51, 434, 691, 508),
            (51, 576, 691, 649),
            (830, 174, 1471, 508),
            (830, 576, 1471, 649),
            (51, 717, 1471, 791),
            (51, 859, 691, 933),
            (51, 1001, 691, 1075),
            (830, 859, 1471, 933),
            (830, 1001, 1471, 1075),
        ],
    }

    for path, expected in correct_orders.items():
        page = load_image(path)
        components = page.cc_analysis()
        ordered = textline_reading_order(components)
        boxes = [
            (seg.ul_x, seg.ul_y, seg.lr_x, seg.lr_y) for seg in ordered
        ]
        assert boxes == expected
        # Drop the references explicitly before the next image is loaded,
        # in the same order as before: segments, components, image.
        del ordered, components, page
def test_textline_reading_order():
    """Check ``textline_reading_order`` against stored orders for two pages.

    Every sample image is loaded, split with ``cc_analysis`` and handed to
    ``textline_reading_order``; the bounding boxes of the returned segments,
    in the order returned, must equal the expected list for that image.

    ``load_image`` is not imported here and is looked up in the enclosing
    scope.
    """
    from gamera.plugins.pagesegmentation import textline_reading_order

    # image path -> expected (ul_x, ul_y, lr_x, lr_y) boxes, in reading order
    correct_orders = {
        "data/reading_order_2.png": [
            (42, 40, 1462, 114),
            (42, 158, 683, 232),
            (42, 276, 683, 350),
            (42, 418, 683, 492),
            (42, 560, 683, 633),
            (822, 158, 1462, 633),
            (42, 701, 1462, 775),
            (42, 843, 494, 917),
            (562, 843, 1132, 917),
            (42, 985, 683, 1059),
            (822, 985, 1132, 1059),
            (1200, 843, 1462, 1059),
        ],
        "data/reading_order.png": [
            (51, 56, 1471, 130),
            (51, 174, 691, 248),
            (51, 292, 691, 366),
            (51, 434, 691, 508),
            (51, 576, 691, 649),
            (830, 174, 1471, 508),
            (830, 576, 1471, 649),
            (51, 717, 1471, 791),
            (51, 859, 691, 933),
            (51, 1001, 691, 1075),
            (830, 859, 1471, 933),
            (830, 1001, 1471, 1075),
        ],
    }

    for path, expected in correct_orders.items():
        page = load_image(path)
        components = page.cc_analysis()
        ordered = textline_reading_order(components)
        boxes = [
            (seg.ul_x, seg.ul_y, seg.lr_x, seg.lr_y) for seg in ordered
        ]
        assert boxes == expected
        # Drop the references explicitly before the next image is loaded,
        # in the same order as before: segments, components, image.
        del ordered, components, page
def sort_glyphs(self):
    """Put the glyphs into reading order and group them into words.

    ``self.glyphs`` is replaced by the result of ``textline_reading_order``.
    The word-spacing threshold is twice the median of the horizontal gaps
    between consecutive glyphs (``ul_x`` of the next glyph minus ``lr_x`` of
    the current one), or 0 when there are no gaps to measure. The result of
    ``chars_make_words`` is stored in ``self.words``.
    """
    self.glyphs = textline_reading_order(self.glyphs)
    ordered = self.glyphs

    # Horizontal distance between each glyph and its successor.
    gaps = [
        ordered[k + 1].ul_x - ordered[k].lr_x
        for k in range(len(ordered) - 1)
    ]
    threshold = median(gaps) * 2.0 if gaps else 0

    self.words = chars_make_words(self.glyphs, threshold)
def sort_glyphs(self):
    """Reorder ``self.glyphs`` for reading and split them into words.

    After ``textline_reading_order`` has been applied, the gap between each
    pair of neighbouring glyphs is measured as the next glyph's ``ul_x``
    minus the current glyph's ``lr_x``. Twice the median gap is passed to
    ``chars_make_words`` as the word-spacing threshold; with no gaps the
    threshold is 0. The words are stored in ``self.words``.
    """
    self.glyphs = textline_reading_order(self.glyphs)
    line = self.glyphs

    # Default used when fewer than two glyphs leave nothing to measure.
    threshold = 0
    spacing = [
        line[pos + 1].ul_x - line[pos].lr_x
        for pos in range(len(line) - 1)
    ]
    if spacing:
        threshold = median(spacing) * 2.0

    self.words = chars_make_words(self.glyphs, threshold)
def get_line_glyphs(self, image, textlines):
    """Build one ``WholisticTextline`` per group of line components.

    ``image.sub_cc_analysis(textlines)`` yields one list of connected
    components per text line. For each list, glyph pairs that
    ``self.check_glyph_greek_accent`` reports as joined are replaced by the
    joined glyph, the remaining glyphs are put into reading order, and the
    duplicates-free result is stored in a new ``WholisticTextline``.

    Args:
        image: object providing ``sub_cc_analysis``.
        textlines: indexable sequence of line regions; entry ``k`` is wrapped
            in ``Rect`` and becomes the bounding box of the k-th line.

    Returns:
        A list of ``WholisticTextline`` objects, one per component list
        returned by ``sub_cc_analysis``.
    """
    lines = []
    _, sub_ccs = image.sub_cc_analysis(textlines)

    for line_index, ccs in enumerate(sub_ccs):
        line_bbox = Rect(textlines[line_index])
        glyphs = ccs[:]
        # Left-to-right by upper-left x. A key function is used because the
        # comparator form ``sort(lambda x, y: cmp(...))`` only exists on
        # Python 2; the resulting (stable) order is the same.
        glyphs.sort(key=lambda g: g.ul_x)

        # NOTE: ``glyphs`` is modified while it is being enumerated (parts
        # are removed, joined glyphs appended). This is kept as in the
        # original; the statement order below matters.
        for position, item in enumerate(glyphs):
            olditem = item
            # Only nearby glyphs are candidates for joining.
            left = max(0, position - 5)
            right = min(position + 5, len(glyphs))
            checklist = glyphs[left:right]

            remove = []
            for glyph in checklist:
                if item == glyph:
                    continue
                result = self.check_glyph_greek_accent(item, glyph)
                if len(result[0]) > 0:
                    # Something has been joined: continue with the joined
                    # glyph and schedule both of its parts for removal.
                    item = result[0][0]
                    remove.append(result[1][0])
                    remove.append(result[1][1])

            joined = olditem != item
            for elem in remove:
                if elem in glyphs:
                    glyphs.remove(elem)
            if joined:
                glyphs.append(item)

        glyphs = textline_reading_order(glyphs)

        # Drop duplicates but keep the reading order just computed; going
        # through a plain ``set`` would return the glyphs in arbitrary order.
        seen = set()
        final = []
        for glyph in glyphs:
            if glyph not in seen:
                seen.add(glyph)
                final.append(glyph)

        new_line = WholisticTextline(line_bbox)
        # NOTE(review): the second argument presumably disables re-sorting
        # inside add_glyphs -- confirm against WholisticTextline.
        new_line.add_glyphs(final, False)
        lines.append(new_line)

    return lines
def get_line_glyphs(self, image, textlines):
    """Build one ``WholisticTextline`` per group of line components.

    ``image.sub_cc_analysis(textlines)`` yields one list of connected
    components per text line. For each list, glyph pairs that
    ``self.check_glyph_greek_accent`` reports as joined are replaced by the
    joined glyph, the remaining glyphs are put into reading order, and the
    duplicates-free result is stored in a new ``WholisticTextline``.

    Args:
        image: object providing ``sub_cc_analysis``.
        textlines: indexable sequence of line regions; entry ``k`` is wrapped
            in ``Rect`` and becomes the bounding box of the k-th line.

    Returns:
        A list of ``WholisticTextline`` objects, one per component list
        returned by ``sub_cc_analysis``.
    """
    lines = []
    _, sub_ccs = image.sub_cc_analysis(textlines)

    for line_index, ccs in enumerate(sub_ccs):
        line_bbox = Rect(textlines[line_index])
        glyphs = ccs[:]
        # Left-to-right by upper-left x. A key function is used because the
        # comparator form ``sort(lambda x, y: cmp(...))`` only exists on
        # Python 2; the resulting (stable) order is the same.
        glyphs.sort(key=lambda g: g.ul_x)

        # NOTE: ``glyphs`` is modified while it is being enumerated (parts
        # are removed, joined glyphs appended). This is kept as in the
        # original; the statement order below matters.
        for position, item in enumerate(glyphs):
            olditem = item
            # Only nearby glyphs are candidates for joining.
            left = max(0, position - 5)
            right = min(position + 5, len(glyphs))
            checklist = glyphs[left:right]

            remove = []
            for glyph in checklist:
                if item == glyph:
                    continue
                result = self.check_glyph_greek_accent(item, glyph)
                if len(result[0]) > 0:
                    # Something has been joined: continue with the joined
                    # glyph and schedule both of its parts for removal.
                    item = result[0][0]
                    remove.append(result[1][0])
                    remove.append(result[1][1])

            joined = olditem != item
            for elem in remove:
                if elem in glyphs:
                    glyphs.remove(elem)
            if joined:
                glyphs.append(item)

        glyphs = textline_reading_order(glyphs)

        # Drop duplicates but keep the reading order just computed; going
        # through a plain ``set`` would return the glyphs in arbitrary order.
        seen = set()
        final = []
        for glyph in glyphs:
            if glyph not in seen:
                seen.add(glyph)
                final.append(glyph)

        new_line = WholisticTextline(line_bbox)
        # NOTE(review): the second argument presumably disables re-sorting
        # inside add_glyphs -- confirm against WholisticTextline.
        new_line.add_glyphs(final, False)
        lines.append(new_line)

    return lines