Пример #1
0
def test_textline_reading_order():
    """Compare textline_reading_order output with known-good box orders.

    For every reference image the connected components are computed,
    passed through ``textline_reading_order`` and reduced to
    ``(ul_x, ul_y, lr_x, lr_y)`` tuples, which must match the expected
    sequence exactly.
    """
    from gamera.core import load_image, init_gamera
    init_gamera()

    from gamera.plugins.pagesegmentation import textline_reading_order

    expected_by_image = {
        "data/reading_order_2.png": [
            (42, 40, 1462, 114),
            (42, 158, 683, 232),
            (42, 276, 683, 350),
            (42, 418, 683, 492),
            (42, 560, 683, 633),
            (822, 158, 1462, 633),
            (42, 701, 1462, 775),
            (42, 843, 494, 917),
            (562, 843, 1132, 917),
            (42, 985, 683, 1059),
            (822, 985, 1132, 1059),
            (1200, 843, 1462, 1059),
        ],
        "data/reading_order.png": [
            (51, 56, 1471, 130),
            (51, 174, 691, 248),
            (51, 292, 691, 366),
            (51, 434, 691, 508),
            (51, 576, 691, 649),
            (830, 174, 1471, 508),
            (830, 576, 1471, 649),
            (51, 717, 1471, 791),
            (51, 859, 691, 933),
            (51, 1001, 691, 1075),
            (830, 859, 1471, 933),
            (830, 1001, 1471, 1075),
        ],
    }

    for path, expected in expected_by_image.items():
        page = load_image(path)
        components = page.cc_analysis()
        ordered = textline_reading_order(components)

        boxes = [(cc.ul_x, cc.ul_y, cc.lr_x, cc.lr_y) for cc in ordered]
        assert boxes == expected

        # Explicitly drop the references before the next iteration.
        del ordered, components, page
Пример #2
0
def test_textline_reading_order():
   """Verify the reading order produced for the two reference images.

   ``load_image`` is not imported here; it has to be available in the
   surrounding module scope.
   """
   from gamera.plugins.pagesegmentation import textline_reading_order

   # Expected (ul_x, ul_y, lr_x, lr_y) boxes, in reading order, per image.
   expected_boxes = {
      "data/reading_order_2.png": [
         (42, 40, 1462, 114), (42, 158, 683, 232),
         (42, 276, 683, 350), (42, 418, 683, 492),
         (42, 560, 683, 633), (822, 158, 1462, 633),
         (42, 701, 1462, 775), (42, 843, 494, 917),
         (562, 843, 1132, 917), (42, 985, 683, 1059),
         (822, 985, 1132, 1059), (1200, 843, 1462, 1059),
      ],
      "data/reading_order.png": [
         (51, 56, 1471, 130), (51, 174, 691, 248),
         (51, 292, 691, 366), (51, 434, 691, 508),
         (51, 576, 691, 649), (830, 174, 1471, 508),
         (830, 576, 1471, 649), (51, 717, 1471, 791),
         (51, 859, 691, 933), (51, 1001, 691, 1075),
         (830, 859, 1471, 933), (830, 1001, 1471, 1075),
      ],
   }

   for image_path, expected in expected_boxes.items():
      image = load_image(image_path)
      components = image.cc_analysis()
      in_order = textline_reading_order(components)

      actual = [(c.ul_x, c.ul_y, c.lr_x, c.lr_y) for c in in_order]
      assert actual == expected

      # Drop the references explicitly, derived objects first.
      del in_order
      del components
      del image
Пример #3
0
    def sort_glyphs(self):
        """Put the glyphs into reading order and group them into words.

        ``self.glyphs`` is replaced by the result of
        ``textline_reading_order``.  The word-spacing threshold handed to
        ``chars_make_words`` is twice the median gap between consecutive
        glyphs (next glyph's ``ul_x`` minus current glyph's ``lr_x``), or
        0 when there are fewer than two glyphs.  The resulting words are
        stored in ``self.words``.
        """
        self.glyphs = textline_reading_order(self.glyphs)

        # Horizontal gap between each glyph and its successor.
        gaps = [
            self.glyphs[k + 1].ul_x - self.glyphs[k].lr_x
            for k in range(len(self.glyphs) - 1)
        ]

        # Word-spacing threshold: double the median gap, 0 without gaps.
        threshold = median(gaps) * 2.0 if len(gaps) > 0 else 0

        self.words = chars_make_words(self.glyphs, threshold)
Пример #4
0
 def sort_glyphs(self):
    """Reorder ``self.glyphs`` into reading order and build ``self.words``.

    The gaps between neighbouring glyphs (``ul_x`` of the later glyph
    minus ``lr_x`` of the earlier one) are collected; twice their median
    is used as the word-spacing threshold for ``chars_make_words``.
    With no gaps available the threshold is 0.
    """
    self.glyphs = textline_reading_order(self.glyphs)

    # Collect the gap in front of every glyph except the first one.
    gaps = []
    for idx in range(1, len(self.glyphs)):
       gaps.append(self.glyphs[idx].ul_x - self.glyphs[idx - 1].lr_x)

    # Threshold for word spacing.
    if not len(gaps) > 0:
       threshold = 0
    else:
       threshold = median(gaps) * 2.0

    self.words = chars_make_words(self.glyphs, threshold)
Пример #5
0
   def get_line_glyphs(self,image,textlines):
      """Build one ``WholisticTextline`` per entry of *textlines*.

      ``image.sub_cc_analysis(textlines)`` supplies the connected
      components of every text line.  Within a line, each glyph is
      compared with its neighbours (in left-to-right order) through
      ``self.check_glyph_greek_accent``; when that call reports a join,
      the joined glyph replaces its two parts.  The surviving glyphs are
      put into reading order, de-duplicated and added to the line object.

      Parameters:
         image: object providing ``sub_cc_analysis``.
         textlines: indexable sequence; ``textlines[k]`` is passed to
            ``Rect`` to obtain the bounding box of line ``k``.

      Returns:
         list of ``WholisticTextline`` objects, one per line.
      """
      lines = []
      # Only the per-line component lists are needed from the result.
      _, sub_ccs = image.sub_cc_analysis(textlines)
      for index, ccs in enumerate(sub_ccs):
         line_bbox = Rect(textlines[index])
         glyphs = ccs[:]

         remove = []
         add = []
         # Sort left to right.  The key= form replaces the Python-2-only
         # comparator call (positional cmp function), which raises a
         # TypeError on Python 3.  Both forms are stable sorts.
         glyphs.sort(key=lambda g: g.ul_x)

         # NOTE(review): glyphs is modified while it is being enumerated
         # and `remove` keeps growing for the whole line; this mirrors
         # the original control flow exactly and is left untouched.
         for position, item in enumerate(glyphs):
            olditem = item
            left = max(0, position - 5)
            right = min(position + 5, len(glyphs))

            for glyph in glyphs[left:right]:
               if item == glyph:
                  continue

               result = self.check_glyph_greek_accent(item, glyph)
               if len(result[0]) > 0:  # something has been joined
                  item = result[0][0]           # the joined glyph
                  remove.append(result[1][0])   # first part of the join
                  remove.append(result[1][1])   # second part of the join

            if olditem != item:
               add.append(item)

            for elem in remove:
               if elem in glyphs:
                  glyphs.remove(elem)

         for elem in add:
            glyphs.append(elem)

         glyphs = textline_reading_order(glyphs)

         # Remove duplicates while keeping the reading order established
         # above; going through list(set(...)) would scramble it.
         seen = set()
         final = []
         for glyph in glyphs:
            if glyph not in seen:
               seen.add(glyph)
               final.append(glyph)

         new_line = WholisticTextline(line_bbox)
         new_line.add_glyphs(final, False)
         lines.append(new_line)

      return lines
Пример #6
0
    def get_line_glyphs(self, image, textlines):
        """Build one ``WholisticTextline`` per entry of *textlines*.

        ``image.sub_cc_analysis(textlines)`` supplies the connected
        components of every text line.  Within a line, each glyph is
        compared with its neighbours (in left-to-right order) through
        ``self.check_glyph_greek_accent``; when that call reports a join,
        the joined glyph replaces its two parts.  The surviving glyphs
        are put into reading order, de-duplicated and added to the line
        object.

        Parameters:
            image: object providing ``sub_cc_analysis``.
            textlines: indexable sequence; ``textlines[k]`` is passed to
                ``Rect`` to obtain the bounding box of line ``k``.

        Returns:
            list of ``WholisticTextline`` objects, one per line.
        """
        lines = []
        # Only the per-line component lists are needed from the result.
        _, sub_ccs = image.sub_cc_analysis(textlines)
        for index, ccs in enumerate(sub_ccs):
            line_bbox = Rect(textlines[index])
            glyphs = ccs[:]

            remove = []
            add = []
            # Sort left to right.  The key= form replaces the
            # Python-2-only comparator call (positional cmp function),
            # which raises a TypeError on Python 3.  Both are stable.
            glyphs.sort(key=lambda g: g.ul_x)

            # NOTE(review): glyphs is modified while it is being
            # enumerated and `remove` keeps growing for the whole line;
            # this mirrors the original control flow exactly.
            for position, item in enumerate(glyphs):
                olditem = item
                left = max(0, position - 5)
                right = min(position + 5, len(glyphs))

                for glyph in glyphs[left:right]:
                    if item == glyph:
                        continue

                    result = self.check_glyph_greek_accent(item, glyph)
                    if len(result[0]) > 0:  # something has been joined
                        item = result[0][0]  # the joined glyph
                        remove.append(result[1][0])  # first part of the join
                        remove.append(result[1][1])  # second part of the join

                if olditem != item:
                    add.append(item)

                for elem in remove:
                    if elem in glyphs:
                        glyphs.remove(elem)

            for elem in add:
                glyphs.append(elem)

            glyphs = textline_reading_order(glyphs)

            # Remove duplicates while keeping the reading order
            # established above; list(set(...)) would scramble it.
            seen = set()
            final = []
            for glyph in glyphs:
                if glyph not in seen:
                    seen.add(glyph)
                    final.append(glyph)

            new_line = WholisticTextline(line_bbox)
            new_line.add_glyphs(final, False)
            lines.append(new_line)

        return lines