示例#1
0
 def split(self, output_dir, **kwargs):
     """Cut the table image into one image file per cell.

     ``kwargs`` are forwarded unchanged to ``self.get_boxes``, which yields
     ``(col, row, left, upper, right, lower)`` tuples.  Each cell is cropped
     from ``self.im``, passed through ``self.transform`` and ``threshold``,
     saved as a ``.tiff`` file inside ``output_dir`` and then announced via
     ``split_image.send``.
     """
     for col, row, left, upper, right, lower in self.get_boxes(**kwargs):
         cell = self.im.crop((left, upper, right, lower))
         cell = self.transform(cell, col, row)
         # Increase contrast: whitish regions become absolute white.
         cell = threshold(cell, 200)

         cell_name = cell_basename(self.filename, col, row) + ".tiff"
         cell_path = os.path.join(output_dir, cell_name)
         cell.save(cell_path)
         cell_md5 = md5sum(cell_path)

         # Describe both the source image and the freshly written cell.
         details = dict(
             input_filename=self.filename,
             input_md5=self.md5,
             filename=cell_name,
             md5=cell_md5,
             column=col,
             row=row,
             left=left,
             upper=upper,
             right=right,
             lower=lower,
         )
         split_image.send(self, **details)
示例#2
0
 def ocr_image(self, input_filename, num_cols, num_rows, split_dir, ocr_dir,
         user):
     """Run tesseract on each existing cell image and publish its text.

     For every ``(col, row)`` pair in ``num_cols`` x ``num_rows`` the cell
     image ``<basename>.tiff`` is looked up in ``split_dir``; cells whose
     image file does not exist are skipped.  tesseract is invoked with
     ``<ocr_dir>/<basename>`` as its output base, the resulting
     ``<ocr_dir>/<basename>.txt`` is read back, and its contents are passed
     to ``image_text.send`` together with the image's name and MD5 and the
     given ``user``.

     Raises:
         subprocess.CalledProcessError: if tesseract exits with a non-zero
             status.
     """
     for col in range(num_cols):
         for row in range(num_rows):
             basename = cell_basename(input_filename, col, row)
             img_filename = basename + '.tiff'
             path = os.path.join(split_dir, img_filename)
             if not os.path.exists(path):
                 continue

             outbase = os.path.join(ocr_dir, basename)
             img_md5 = md5sum(path)
             # Hand the arguments over as a list instead of formatting a
             # command string and re-splitting it: paths containing
             # whitespace, quotes or backslashes would otherwise be broken
             # into the wrong arguments.
             subprocess.check_call(["tesseract", path, outbase])

             output_filename = outbase + '.txt'
             with open(output_filename, 'r') as f:
                 text = f.read()
             # The file is closed before receivers run; they only need the
             # text that was read.
             image_text.send(self, source_filename=img_filename,
                 source_md5=img_md5, method='ocr',
                 text=text, user=user)